movie_crawler 0.1.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.DS_Store +0 -0
- data/README.md +7 -6
- data/bin/app +6 -8
- data/lib/.DS_Store +0 -0
- data/lib/movie_crawler.rb +2 -1
- data/lib/movie_crawler/.DS_Store +0 -0
- data/lib/movie_crawler/crawler.rb +184 -131
- data/lib/movie_crawler/version.rb +1 -1
- data/movie_crawler.gemspec +2 -1
- data/spec/movies_spec.rb +4 -4
- data/spec/rank_spec.rb +1 -1
- metadata +19 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0e40451420c351951666d34b022d5a1fb79c31e6
|
4
|
+
data.tar.gz: 2caf81767ea7b73af694629c149c07e763921c31
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 292e4344a628490095120a074e1b5c28fa624b4f7a986f2fbb51d301f5989ac3adb8f4727aa2d007115c7fb135cf8726e119e79d54487f7b0595ed7f3902ad8a
|
7
|
+
data.tar.gz: a41689f0cc79363c52ad158c4ab45327d0995474fab5ef5ba94d581532f99850e8bb63ab39c8c6a8c98e81695fe941f6d9d7cf7286ebea64bcf085790f4ef379
|
data/.DS_Store
ADDED
Binary file
|
data/README.md
CHANGED
@@ -8,7 +8,9 @@ MovieInfo tries to grabs some information on the [**@movies**](www.atmovies.com.
|
|
8
8
|
|
9
9
|
## About
|
10
10
|
|
11
|
-
If you want check the movie schedule in Taiwan.And choose which to see at the weekend. The gem will provide you with the current films including first, second and recommend movie list. Also allowing you check the description and ranking list in the specific one
|
11
|
+
If you want to check the movie schedule in Taiwan and choose which film to see at the weekend, this gem provides the current films, including the first-round, second-round and recommended movie lists. It also allows you to check the description and ranking list for a specific one.
|
12
|
+
|
13
|
+
New in 0.2.1: the function get_movie_info(movie_name) returns detailed information about the movie if it is available on atmovies.
|
12
14
|
|
13
15
|
## Usage
|
14
16
|
This gem could be used as a command line utility or called from code
|
@@ -19,11 +21,11 @@ movie_crawler
|
|
19
21
|
### code example:
|
20
22
|
|
21
23
|
require 'movie_crawler'
|
22
|
-
|
23
|
-
movie_list =
|
24
|
+
|
25
|
+
movie_list = MovieCrawler.movies('LATEST') # or 'SECOND_ROUND'
|
24
26
|
puts movie_list
|
25
|
-
|
26
|
-
dvd_rank =
|
27
|
+
|
28
|
+
dvd_rank = MovieCrawler.dvd_rank
|
27
29
|
puts dvd_rank
|
28
30
|
|
29
31
|
## Format
|
@@ -39,4 +41,3 @@ movie_crawler
|
|
39
41
|
**runtime(minutes):** '96'
|
40
42
|
|
41
43
|
**trailer:** http://app.atmovies.com.tw/movie/movie.cfm?action=trailer&film_id=fmus01587310
|
42
|
-
|
data/bin/app
CHANGED
@@ -1,20 +1,18 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
require 'movie_crawler'
|
3
|
+
# require File.expand_path('../../lib/movie_crawler.rb', __FILE__)
|
3
4
|
|
4
5
|
puts "\ndvd_rank:"
|
5
|
-
puts MovieCrawler
|
6
|
+
puts MovieCrawler.dvd_rank
|
6
7
|
|
7
8
|
puts "\nus_weekend_rank:"
|
8
|
-
puts MovieCrawler
|
9
|
+
puts MovieCrawler.us_weekend
|
9
10
|
|
10
11
|
puts "\ntaipei_weekend_rank:"
|
11
|
-
puts MovieCrawler
|
12
|
+
puts MovieCrawler.taipei_weekend
|
12
13
|
|
13
14
|
puts "\nlastest_movie_list:"
|
14
|
-
puts MovieCrawler
|
15
|
-
|
16
|
-
puts "\nfirst_round_movie_list_:"
|
17
|
-
puts MovieCrawler::MovieInfo.movies('FIRST_ROUND')
|
15
|
+
puts MovieCrawler.movies('LATEST')
|
18
16
|
|
19
17
|
puts "\nsecond_round_movie_list:"
|
20
|
-
puts MovieCrawler
|
18
|
+
puts MovieCrawler.movies('SECOND_ROUND')
|
data/lib/.DS_Store
ADDED
Binary file
|
data/lib/movie_crawler.rb
CHANGED
Binary file
|
@@ -2,162 +2,215 @@ require 'nokogiri'
|
|
2
2
|
require 'open-uri'
|
3
3
|
require 'yaml'
|
4
4
|
require 'iconv'
|
5
|
+
require 'mechanize'
|
5
6
|
|
6
7
|
module MovieCrawler
|
7
8
|
# get the info from atmovies
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
9
|
+
URL_LIST = {
|
10
|
+
'LATEST' => 'http://www.atmovies.com.tw/movie/movie_new.html',
|
11
|
+
'SECOND_ROUND' => 'http://www.atmovies.com.tw/movie/movie_now2-1.html'
|
12
|
+
# first_round is unnecessary; the result is the same as latest.
|
13
|
+
}
|
14
|
+
|
15
|
+
MOVIE_BASE_URL = 'http://www.atmovies.com.tw/movie/'
|
16
|
+
WHOLE_MOVIEWS_TITLES = "//div[@class = 'title']/a"
|
17
|
+
WHOLE_MOVIEWS_STORIES = "//div[@class = 'story']"
|
18
|
+
WHOLE_MOVIEWS_DATES = "//div[@class = 'date']/b"
|
19
|
+
WHOLE_MOVIEWS_CODES = "//div[@class = 'title']/a/@href"
|
20
|
+
REFLECTION_BASE = 'http://app.atmovies.com.tw/tool/good.cfm?type=film'
|
21
|
+
REFLECTION_FS = '&fs=2'
|
22
|
+
REFLECTION_CLASS = "//div[@class = 'act01']"
|
23
|
+
REFLECTION_SATITLE = '&satitle='
|
24
|
+
REFLECTION_SAID = '&said='
|
25
|
+
REFLECTION_NAME = "//span[@class = 'at21b']"
|
26
|
+
TRAILER_URL = 'http://app.atmovies.com.tw/movie/movie.cfm?action=trailer&film_id='
|
27
|
+
ATMOVIES_MAIN_URL = 'http://www.atmovies.com.tw/home/movie_homepage.html'
|
28
|
+
SEARCH_URL = 'http://search.atmovies.com.tw/search/search.cfm'
|
29
|
+
|
30
|
+
# start a Mechanize agent for crawling from web to web
|
31
|
+
def self.start_mechan
|
32
|
+
agent = Mechanize.new
|
33
|
+
agent.user_agent_alias = 'Mac Safari'
|
34
|
+
agent.post(SEARCH_URL)
|
35
|
+
search_page = agent.post(SEARCH_URL)
|
36
|
+
search_page.form_with(action: 'search.cfm')
|
37
|
+
end
|
35
38
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
39
|
+
# finish a post request to search for specific film
|
40
|
+
def self.trace_page(form, movie_name)
|
41
|
+
form.search_term = movie_name # movie name
|
42
|
+
search_result = form.submit(form.button_with(name: 'search'))
|
43
|
+
link = search_result.link_with(href: /F&d=/)
|
44
|
+
link.click
|
45
|
+
end
|
40
46
|
|
41
|
-
|
42
|
-
|
43
|
-
|
47
|
+
def self.parse_comment(film_page)
|
48
|
+
begin
|
49
|
+
comment_data = film_page.at('.comment_data').text
|
50
|
+
.gsub(/[\t\r\n]+/, "\t")
|
51
|
+
.strip.split("\t")
|
52
|
+
rescue
|
53
|
+
comment_data = 'not exised'
|
44
54
|
end
|
55
|
+
end
|
45
56
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
table = table.each { |item| item.gsub(/[\t\r\n]/, '') }
|
52
|
-
table.pop
|
53
|
-
rankmix(table)
|
54
|
-
end
|
57
|
+
def self.parse_crew(film_page)
|
58
|
+
crew_row = film_page.at('.crew_row').text.gsub(/[\t\r\n]+/, "\t").strip.split(/[\t+]/)
|
59
|
+
crew_row = crew_row.collect { |a| a.strip }
|
60
|
+
crew_row -= ['']
|
61
|
+
end
|
55
62
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
63
|
+
# parse the film info in the target web page
|
64
|
+
def self.parse_movie_info(film_page)
|
65
|
+
name = film_page.at('.name .at21b').text.strip
|
66
|
+
schedule = film_page.at('#movie_info01 b').text.strip
|
67
|
+
sub_content = film_page.at('.sub_content').text.strip
|
68
|
+
crew_row = parse_crew(film_page)
|
69
|
+
comment_data = parse_comment(film_page)
|
70
|
+
{ 'name' => name, 'schedule' => schedule,
|
71
|
+
'crew_info' => crew_row, 'comment' => comment_data,
|
72
|
+
'content' => sub_content }
|
73
|
+
end
|
64
74
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
75
|
+
# combine the workflow for user to call for movie info
|
76
|
+
def self.get_movie_info(movie_name)
|
77
|
+
form = start_mechan
|
78
|
+
film_page = trace_page(form, movie_name)
|
79
|
+
collection = parse_movie_info(film_page)
|
80
|
+
to_yaml(collection)
|
81
|
+
end
|
70
82
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
stories = get_stories(document)
|
77
|
-
dates = get_dates(document)
|
78
|
-
trailers = get_trailer(document)
|
79
|
-
runtimes = get_runtime(document)
|
80
|
-
mix(titles, stories, dates, runtimes, trailers)
|
81
|
-
end
|
83
|
+
# add three rank parser
|
84
|
+
def self.us_weekend
|
85
|
+
result = get_table('1')
|
86
|
+
to_yaml(result)
|
87
|
+
end
|
82
88
|
|
83
|
-
|
84
|
-
|
85
|
-
|
89
|
+
def self.taipei_weekend
|
90
|
+
result = get_table('2')
|
91
|
+
to_yaml(result)
|
92
|
+
end
|
86
93
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
94
|
+
def self.dvd_rank
|
95
|
+
result = get_table('3')
|
96
|
+
to_yaml(result)
|
97
|
+
end
|
91
98
|
|
92
|
-
|
93
|
-
|
94
|
-
|
99
|
+
# parse the ranktable info
|
100
|
+
def self.get_table(rankid)
|
101
|
+
doc = open_html(ATMOVIES_MAIN_URL)
|
102
|
+
table = doc.xpath("//*[@id = 'ranklist']/div[" + rankid + ']').text
|
103
|
+
table = table.gsub(' : ', ':').gsub(' ', '').split
|
104
|
+
table = table.each { |item| item.gsub(/[\t\r\n]/, '') }
|
105
|
+
table.pop
|
106
|
+
rankmix(table)
|
107
|
+
end
|
95
108
|
|
96
|
-
|
97
|
-
|
98
|
-
|
109
|
+
# mix the rank info
|
110
|
+
def self.rankmix(t)
|
111
|
+
t.each_with_index.map do |_, index|
|
112
|
+
{
|
113
|
+
index + 1 => t[index].to_s
|
114
|
+
}
|
99
115
|
end
|
116
|
+
end
|
100
117
|
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
118
|
+
# switch to different url accordingly
|
119
|
+
def self.movies(category = 'LATEST')
|
120
|
+
result = movies_parser(category)
|
121
|
+
to_yaml(result)
|
122
|
+
end
|
105
123
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
124
|
+
# parse the movies accordingly
|
125
|
+
def self.movies_parser(category)
|
126
|
+
url = URL_LIST[category.upcase]
|
127
|
+
document = open_html(url)
|
128
|
+
titles = get_titles(document)
|
129
|
+
stories = get_stories(document)
|
130
|
+
dates = get_dates(document)
|
131
|
+
trailers = get_trailer(document)
|
132
|
+
runtimes = get_runtime(document)
|
133
|
+
mix(titles, stories, dates, runtimes, trailers)
|
134
|
+
end
|
111
135
|
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
end
|
136
|
+
def self.encode_zh(text)
|
137
|
+
encoded = URI.encode(Iconv.new('big5', 'utf-8').iconv(text)).to_s
|
138
|
+
REFLECTION_SATITLE + encoded
|
139
|
+
end
|
117
140
|
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
end
|
141
|
+
def self.get_one_movie_name(doc)
|
142
|
+
name = doc.xpath(REFLECTION_NAME).text
|
143
|
+
name.gsub!(/[\t\r\n]/, '')
|
144
|
+
end
|
123
145
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
days_times.map { |d_t| d_t[1].match(%r{\d+/\d+/\d+}).to_s } # mm/dd/yy
|
128
|
-
end
|
146
|
+
def self.get_reflection(doc)
|
147
|
+
doc.xpath(REFLECTION_CLASS).text.gsub!(/[\t\r\n]/, '').split
|
148
|
+
end
|
129
149
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
end
|
150
|
+
# get the details of movie
|
151
|
+
def self.movie_details(code)
|
152
|
+
open_html(MOVIE_BASE_URL + code + '/')
|
153
|
+
end
|
135
154
|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
end
|
155
|
+
# open the destination url
|
156
|
+
def self.open_html(url)
|
157
|
+
Nokogiri::HTML(open(url))
|
158
|
+
end
|
141
159
|
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
160
|
+
# get the movie name
|
161
|
+
def self.get_titles(doc)
|
162
|
+
titles = doc.xpath(WHOLE_MOVIEWS_TITLES)
|
163
|
+
titles.map { |title| title.text.gsub(/[\t\n\r]/, '') }
|
164
|
+
end
|
147
165
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
'trailer' => tr[index] }
|
154
|
-
end
|
155
|
-
informations
|
156
|
-
end
|
166
|
+
# get the storyline of movie
|
167
|
+
def self.get_stories(doc)
|
168
|
+
storylines = doc.xpath(WHOLE_MOVIEWS_STORIES)
|
169
|
+
storylines.map(&:text) # { |story| story.text }
|
170
|
+
end
|
157
171
|
|
158
|
-
|
159
|
-
|
160
|
-
|
172
|
+
# get the runtime of movie
|
173
|
+
def self.get_runtime(doc)
|
174
|
+
days_times = split_day_and_time(doc)
|
175
|
+
days_times.map { |d_t| d_t[0].match(/\d+/).to_s }
|
176
|
+
end
|
177
|
+
|
178
|
+
# get the release date
|
179
|
+
def self.get_dates(doc)
|
180
|
+
days_times = split_day_and_time(doc)
|
181
|
+
days_times.map { |d_t| d_t[1].match(%r{\d+/\d+/\d+}).to_s } # mm/dd/yy
|
182
|
+
end
|
183
|
+
|
184
|
+
def self.split_day_and_time(doc)
|
185
|
+
gap = "\n\t\t\t\t\s\s\s\s\t"
|
186
|
+
days_times = doc.xpath(WHOLE_MOVIEWS_DATES)
|
187
|
+
days_times.map { |d_t| d_t.text.split(gap) }
|
188
|
+
end
|
189
|
+
|
190
|
+
# get the code of movies
|
191
|
+
def self.get_codes(doc)
|
192
|
+
codes = doc.xpath(WHOLE_MOVIEWS_CODES)
|
193
|
+
codes.map { |code| code.value.split('/')[2] }
|
194
|
+
end
|
195
|
+
|
196
|
+
# get the trailer link of the movies
|
197
|
+
def self.get_trailer(doc)
|
198
|
+
codes = get_codes(doc)
|
199
|
+
codes.map { |trailer| TRAILER_URL + trailer }
|
200
|
+
end
|
201
|
+
|
202
|
+
# build the hash for yaml output
|
203
|
+
def self.mix(t, s, d, ti, tr)
|
204
|
+
informations = t.each_with_index.map do |_, index|
|
205
|
+
{ 'title' => t[index], 'story' => s[index], \
|
206
|
+
'date' => d[index], 'runtime(minutes)' => ti[index], \
|
207
|
+
'trailer' => tr[index] }
|
161
208
|
end
|
209
|
+
informations
|
210
|
+
end
|
211
|
+
|
212
|
+
# convert the schedules to yaml format
|
213
|
+
def self.to_yaml(mix)
|
214
|
+
mix.to_yaml
|
162
215
|
end
|
163
216
|
end
|
data/movie_crawler.gemspec
CHANGED
@@ -4,7 +4,7 @@ Gem::Specification.new do |s|
|
|
4
4
|
s.name = 'movie_crawler'
|
5
5
|
s.version = MovieCrawler::VERSION
|
6
6
|
s.executables << 'app'
|
7
|
-
s.date = '2014-
|
7
|
+
s.date = '2014-11-02'
|
8
8
|
s.summary = 'Grab the movies information from the atmovies.com'
|
9
9
|
s.description = 'Grab the movies information from the atmovies.com'
|
10
10
|
s.authors = ['Lee Chen', 'Chen Hung Tu', 'David Yang']
|
@@ -17,4 +17,5 @@ Gem::Specification.new do |s|
|
|
17
17
|
s.add_development_dependency 'minitest-rg'
|
18
18
|
s.add_runtime_dependency 'nokogiri', '>= 1.6.2' # v.1.6.2 has documented problems
|
19
19
|
s.add_runtime_dependency 'iconv'
|
20
|
+
s.add_runtime_dependency 'mechanize'
|
20
21
|
end
|
data/spec/movies_spec.rb
CHANGED
@@ -2,13 +2,13 @@ require 'minitest/autorun'
|
|
2
2
|
require 'minitest/rg'
|
3
3
|
require File.expand_path('../../lib/movie_crawler', __FILE__)
|
4
4
|
|
5
|
-
LATEST = 'http://www.atmovies.com.tw/movie/movie_new.html'
|
6
|
-
SECOND_ROUND = 'http://www.atmovies.com.tw/movie/movie_now2-1.html'
|
5
|
+
# LATEST = 'http://www.atmovies.com.tw/movie/movie_new.html'
|
6
|
+
# SECOND_ROUND = 'http://www.atmovies.com.tw/movie/movie_now2-1.html'
|
7
7
|
TRAILER_URL = 'http://app.atmovies.com.tw/movie/movie.cfm?action=trailer&film_id='
|
8
8
|
|
9
9
|
# generate a random number to test either latest or second_round
|
10
|
-
rand < 0.5 ? url = LATEST : url = SECOND_ROUND
|
11
|
-
sample = MovieCrawler
|
10
|
+
rand < 0.5 ? url = 'LATEST' : url = 'SECOND_ROUND'
|
11
|
+
sample = MovieCrawler.movies_parser(url)
|
12
12
|
|
13
13
|
describe 'movies_parser should involve' do
|
14
14
|
|
data/spec/rank_spec.rb
CHANGED
@@ -4,7 +4,7 @@ require File.expand_path('../../lib/movie_crawler', __FILE__)
|
|
4
4
|
|
5
5
|
# 1 to 3 select different test cases: us, taipei, dvd
|
6
6
|
rand_rank = rand(1..3)
|
7
|
-
sample =
|
7
|
+
sample = MovieCrawler.get_table(rand_rank.to_s)
|
8
8
|
|
9
9
|
describe 'table should involve' do
|
10
10
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: movie_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lee Chen
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2014-
|
13
|
+
date: 2014-11-02 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: minitest
|
@@ -68,6 +68,20 @@ dependencies:
|
|
68
68
|
- - ">="
|
69
69
|
- !ruby/object:Gem::Version
|
70
70
|
version: '0'
|
71
|
+
- !ruby/object:Gem::Dependency
|
72
|
+
name: mechanize
|
73
|
+
requirement: !ruby/object:Gem::Requirement
|
74
|
+
requirements:
|
75
|
+
- - ">="
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
type: :runtime
|
79
|
+
prerelease: false
|
80
|
+
version_requirements: !ruby/object:Gem::Requirement
|
81
|
+
requirements:
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: '0'
|
71
85
|
description: Grab the movies information from the atmovies.com
|
72
86
|
email: chung1350@hotmail.com
|
73
87
|
executables:
|
@@ -75,13 +89,16 @@ executables:
|
|
75
89
|
extensions: []
|
76
90
|
extra_rdoc_files: []
|
77
91
|
files:
|
92
|
+
- ".DS_Store"
|
78
93
|
- ".gitignore"
|
79
94
|
- ".travis.yml"
|
80
95
|
- Gemfile
|
81
96
|
- README.md
|
82
97
|
- Rakefile
|
83
98
|
- bin/app
|
99
|
+
- lib/.DS_Store
|
84
100
|
- lib/movie_crawler.rb
|
101
|
+
- lib/movie_crawler/.DS_Store
|
85
102
|
- lib/movie_crawler/crawler.rb
|
86
103
|
- lib/movie_crawler/version.rb
|
87
104
|
- movie_crawler-0.1.0.gem
|