movie_crawler 0.1.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3f6ebe9cba57dd662d4f04a78fc0891ad3c1f77f
4
- data.tar.gz: 4e63f3330a19ebae78af35c75dc64c201e2ad72d
3
+ metadata.gz: 0e40451420c351951666d34b022d5a1fb79c31e6
4
+ data.tar.gz: 2caf81767ea7b73af694629c149c07e763921c31
5
5
  SHA512:
6
- metadata.gz: 5843495e2bb5b0a9b5bad3ad57e995484d62e843f46ff0787c773ba9cb3320a028277032ad438c2bc59396a5484492691e06d43769a1981982cb3ec70d897285
7
- data.tar.gz: 5e2eb53601932cc76f510770dd4c834f24f0afe1fa76543f1372a3dce82e3dc765e6e2f13a13826f4456b3b4474c043d2c4b8ec4b5849af7db32495c9adefd46
6
+ metadata.gz: 292e4344a628490095120a074e1b5c28fa624b4f7a986f2fbb51d301f5989ac3adb8f4727aa2d007115c7fb135cf8726e119e79d54487f7b0595ed7f3902ad8a
7
+ data.tar.gz: a41689f0cc79363c52ad158c4ab45327d0995474fab5ef5ba94d581532f99850e8bb63ab39c8c6a8c98e81695fe941f6d9d7cf7286ebea64bcf085790f4ef379
Binary file
data/README.md CHANGED
@@ -8,7 +8,9 @@ MovieInfo tries to grabs some information on the [**@movies**](www.atmovies.com.
8
8
 
9
9
  ## About
10
10
 
11
- If you want check the movie schedule in Taiwan.And choose which to see at the weekend. The gem will provide you with the current films including first, second and recommend movie list. Also allowing you check the description and ranking list in the specific one
11
+ If you want to check the movie schedule in Taiwan and choose which film to see at the weekend, this gem provides the current films, including the first-round, second-round and recommended movie lists. It also lets you check the description of a specific movie and the ranking lists.
12
+
13
+ New in 0.2.1: the function get_movie_info(movie_name) returns detailed information about the movie, provided it can be found on atmovies.
12
14
 
13
15
  ## Usage
14
16
  This gem could be used as a command line utility or called from code
@@ -19,11 +21,11 @@ movie_crawler
19
21
  ### code example:
20
22
 
21
23
  require 'movie_crawler'
22
-
23
- movie_list = MovieInfo.movies('FIRST_ROUND') # 'LATEST' or 'SECOND_ROUND'
24
+
25
+ movie_list = MovieCrawler.movies('LATEST') # or 'SECOND_ROUND'
24
26
  puts movie_list
25
-
26
- dvd_rank = MovieInfo.dvd_rank
27
+
28
+ dvd_rank = MovieCrawler.dvd_rank
27
29
  puts dvd_rank
28
30
 
29
31
  ## Format
@@ -39,4 +41,3 @@ movie_crawler
39
41
  **runtime(minutes):** '96'
40
42
 
41
43
  **trailer:** http://app.atmovies.com.tw/movie/movie.cfm?action=trailer&film_id=fmus01587310
42
-
data/bin/app CHANGED
@@ -1,20 +1,18 @@
1
1
  #!/usr/bin/env ruby
2
2
  require 'movie_crawler'
3
+ # require File.expand_path('../../lib/movie_crawler.rb', __FILE__)
3
4
 
4
5
  puts "\ndvd_rank:"
5
- puts MovieCrawler::MovieInfo.dvd_rank
6
+ puts MovieCrawler.dvd_rank
6
7
 
7
8
  puts "\nus_weekend_rank:"
8
- puts MovieCrawler::MovieInfo.us_weekend
9
+ puts MovieCrawler.us_weekend
9
10
 
10
11
  puts "\ntaipei_weekend_rank:"
11
- puts MovieCrawler::MovieInfo.taipei_weekend
12
+ puts MovieCrawler.taipei_weekend
12
13
 
13
14
  puts "\nlastest_movie_list:"
14
- puts MovieCrawler::MovieInfo.movies('LATEST')
15
-
16
- puts "\nfirst_round_movie_list_:"
17
- puts MovieCrawler::MovieInfo.movies('FIRST_ROUND')
15
+ puts MovieCrawler.movies('LATEST')
18
16
 
19
17
  puts "\nsecond_round_movie_list:"
20
- puts MovieCrawler::MovieInfo.movies('SECOND_ROUND')
18
+ puts MovieCrawler.movies('SECOND_ROUND')
Binary file
@@ -1,2 +1,3 @@
1
+ # require File.expand_path('../movie_crawler/crawler.rb', __FILE__)
1
2
  require 'movie_crawler/crawler.rb'
2
- require 'movie_crawler/version.rb'
3
+ require 'movie_crawler/version.rb'
@@ -2,162 +2,215 @@ require 'nokogiri'
2
2
  require 'open-uri'
3
3
  require 'yaml'
4
4
  require 'iconv'
5
+ require 'mechanize'
5
6
 
6
7
  module MovieCrawler
7
8
  # get the info from atmovies
8
- class MovieInfo
9
-
10
- URL_LIST = {
11
- 'LATEST' => 'http://www.atmovies.com.tw/movie/movie_new.html',
12
- 'SECOND_ROUND' => 'http://www.atmovies.com.tw/movie/movie_now2-1.html'
13
- # first_round is unnessary, the result is the same as latest.
14
- }
15
-
16
- MOVIE_BASE_URL = 'http://www.atmovies.com.tw/movie/'
17
- WHOLE_MOVIEWS_TITLES = "//div[@class = 'title']/a"
18
- WHOLE_MOVIEWS_STORIES = "//div[@class = 'story']"
19
- WHOLE_MOVIEWS_DATES = "//div[@class = 'date']/b"
20
- WHOLE_MOVIEWS_CODES = "//div[@class = 'title']/a/@href"
21
- REFLECTION_BASE = 'http://app.atmovies.com.tw/tool/good.cfm?type=film'
22
- REFLECTION_FS = '&fs=2'
23
- REFLECTION_CLASS = "//div[@class = 'act01']"
24
- REFLECTION_SATITLE = '&satitle='
25
- REFLECTION_SAID = '&said='
26
- REFLECTION_NAME = "//span[@class = 'at21b']"
27
- TRAILER_URL = 'http://app.atmovies.com.tw/movie/movie.cfm?action=trailer&film_id='
28
- ATMOVIES_MAIN_URL = 'http://www.atmovies.com.tw/home/movie_homepage.html'
29
-
30
- # add three rank parser
31
- def self.us_weekend
32
- result = get_table('1')
33
- to_yaml(result)
34
- end
9
+ URL_LIST = {
10
+ 'LATEST' => 'http://www.atmovies.com.tw/movie/movie_new.html',
11
+ 'SECOND_ROUND' => 'http://www.atmovies.com.tw/movie/movie_now2-1.html'
12
+ # first_round is unnecessary, the result is the same as latest.
13
+ }
14
+
15
+ MOVIE_BASE_URL = 'http://www.atmovies.com.tw/movie/'
16
+ WHOLE_MOVIEWS_TITLES = "//div[@class = 'title']/a"
17
+ WHOLE_MOVIEWS_STORIES = "//div[@class = 'story']"
18
+ WHOLE_MOVIEWS_DATES = "//div[@class = 'date']/b"
19
+ WHOLE_MOVIEWS_CODES = "//div[@class = 'title']/a/@href"
20
+ REFLECTION_BASE = 'http://app.atmovies.com.tw/tool/good.cfm?type=film'
21
+ REFLECTION_FS = '&fs=2'
22
+ REFLECTION_CLASS = "//div[@class = 'act01']"
23
+ REFLECTION_SATITLE = '&satitle='
24
+ REFLECTION_SAID = '&said='
25
+ REFLECTION_NAME = "//span[@class = 'at21b']"
26
+ TRAILER_URL = 'http://app.atmovies.com.tw/movie/movie.cfm?action=trailer&film_id='
27
+ ATMOVIES_MAIN_URL = 'http://www.atmovies.com.tw/home/movie_homepage.html'
28
+ SEARCH_URL = 'http://search.atmovies.com.tw/search/search.cfm'
29
+
30
+ # start a Mechanize agent for crawling from web to web
31
+ def self.start_mechan
32
+ agent = Mechanize.new
33
+ agent.user_agent_alias = 'Mac Safari'
34
+ agent.post(SEARCH_URL)
35
+ search_page = agent.post(SEARCH_URL)
36
+ search_page.form_with(action: 'search.cfm')
37
+ end
35
38
 
36
- def self.taipei_weekend
37
- result = get_table('2')
38
- to_yaml(result)
39
- end
39
+ # finish a post request to search for specific film
40
+ def self.trace_page(form, movie_name)
41
+ form.search_term = movie_name # movie name
42
+ search_result = form.submit(form.button_with(name: 'search'))
43
+ link = search_result.link_with(href: /F&d=/)
44
+ link.click
45
+ end
40
46
 
41
- def self.dvd_rank
42
- result = get_table('3')
43
- to_yaml(result)
47
+ def self.parse_comment(film_page)
48
+ begin
49
+ comment_data = film_page.at('.comment_data').text
50
+ .gsub(/[\t\r\n]+/, "\t")
51
+ .strip.split("\t")
52
+ rescue
53
+ comment_data = 'not exised'
44
54
  end
55
+ end
45
56
 
46
- # parse the ranktable info
47
- def self.get_table(rankid)
48
- doc = open_html(ATMOVIES_MAIN_URL)
49
- table = doc.xpath("//*[@id = 'ranklist']/div[" + rankid + ']').text
50
- table = table.gsub(' : ', ':').gsub(' ', '').split
51
- table = table.each { |item| item.gsub(/[\t\r\n]/, '') }
52
- table.pop
53
- rankmix(table)
54
- end
57
+ def self.parse_crew(film_page)
58
+ crew_row = film_page.at('.crew_row').text.gsub(/[\t\r\n]+/, "\t").strip.split(/[\t+]/)
59
+ crew_row = crew_row.collect { |a| a.strip }
60
+ crew_row -= ['']
61
+ end
55
62
 
56
- # mix the rank info
57
- def self.rankmix(t)
58
- t.each_with_index.map do |_, index|
59
- {
60
- index + 1 => t[index].to_s
61
- }
62
- end
63
- end
63
+ # parse the film info in the target web page
64
+ def self.parse_movie_info(film_page)
65
+ name = film_page.at('.name .at21b').text.strip
66
+ schedule = film_page.at('#movie_info01 b').text.strip
67
+ sub_content = film_page.at('.sub_content').text.strip
68
+ crew_row = parse_crew(film_page)
69
+ comment_data = parse_comment(film_page)
70
+ { 'name' => name, 'schedule' => schedule,
71
+ 'crew_info' => crew_row, 'comment' => comment_data,
72
+ 'content' => sub_content }
73
+ end
64
74
 
65
- # switch to different url accordingly
66
- def self.movies(category = 'LATEST')
67
- result = movies_parser(category)
68
- to_yaml(result)
69
- end
75
+ # combine the workflow for user to call for movie info
76
+ def self.get_movie_info(movie_name)
77
+ form = start_mechan
78
+ film_page = trace_page(form, movie_name)
79
+ collection = parse_movie_info(film_page)
80
+ to_yaml(collection)
81
+ end
70
82
 
71
- # parse the movies acoordingly
72
- def self.movies_parser(category)
73
- url = URL_LIST[category.upcase]
74
- document = open_html(url)
75
- titles = get_titles(document)
76
- stories = get_stories(document)
77
- dates = get_dates(document)
78
- trailers = get_trailer(document)
79
- runtimes = get_runtime(document)
80
- mix(titles, stories, dates, runtimes, trailers)
81
- end
83
+ # add three rank parser
84
+ def self.us_weekend
85
+ result = get_table('1')
86
+ to_yaml(result)
87
+ end
82
88
 
83
- def self.encode_zh(text)
84
- REFLECTION_SATITLE + URI.encode(Iconv.new('big5', 'utf-8').iconv(text)).to_s
85
- end
89
+ def self.taipei_weekend
90
+ result = get_table('2')
91
+ to_yaml(result)
92
+ end
86
93
 
87
- def self.get_one_movie_name(doc)
88
- name = doc.xpath(REFLECTION_NAME).text
89
- name.gsub!(/[\t\r\n]/, '')
90
- end
94
+ def self.dvd_rank
95
+ result = get_table('3')
96
+ to_yaml(result)
97
+ end
91
98
 
92
- def self.get_reflection(doc)
93
- doc.xpath(REFLECTION_CLASS).text.gsub!(/[\t\r\n]/, '').split
94
- end
99
+ # parse the ranktable info
100
+ def self.get_table(rankid)
101
+ doc = open_html(ATMOVIES_MAIN_URL)
102
+ table = doc.xpath("//*[@id = 'ranklist']/div[" + rankid + ']').text
103
+ table = table.gsub(' : ', ':').gsub(' ', '').split
104
+ table = table.each { |item| item.gsub(/[\t\r\n]/, '') }
105
+ table.pop
106
+ rankmix(table)
107
+ end
95
108
 
96
- # get the details of movie
97
- def self.movie_details(code)
98
- open_html(MOVIE_BASE_URL + code + '/')
109
+ # mix the rank info
110
+ def self.rankmix(t)
111
+ t.each_with_index.map do |_, index|
112
+ {
113
+ index + 1 => t[index].to_s
114
+ }
99
115
  end
116
+ end
100
117
 
101
- # open the destination url
102
- def self.open_html(url)
103
- Nokogiri::HTML(open(url))
104
- end
118
+ # switch to different url accordingly
119
+ def self.movies(category = 'LATEST')
120
+ result = movies_parser(category)
121
+ to_yaml(result)
122
+ end
105
123
 
106
- # get the movie name
107
- def self.get_titles(doc)
108
- titles = doc.xpath(WHOLE_MOVIEWS_TITLES)
109
- titles.map { |title| title.text.gsub(/[\t\n\r]/, '') }
110
- end
124
+ # parse the movies accordingly
125
+ def self.movies_parser(category)
126
+ url = URL_LIST[category.upcase]
127
+ document = open_html(url)
128
+ titles = get_titles(document)
129
+ stories = get_stories(document)
130
+ dates = get_dates(document)
131
+ trailers = get_trailer(document)
132
+ runtimes = get_runtime(document)
133
+ mix(titles, stories, dates, runtimes, trailers)
134
+ end
111
135
 
112
- # get the storyline of movie
113
- def self.get_stories(doc)
114
- storylines = doc.xpath(WHOLE_MOVIEWS_STORIES)
115
- storylines.map(&:text) # { |story| story.text }
116
- end
136
+ def self.encode_zh(text)
137
+ encoded = URI.encode(Iconv.new('big5', 'utf-8').iconv(text)).to_s
138
+ REFLECTION_SATITLE + encoded
139
+ end
117
140
 
118
- # get the runtime of movie
119
- def self.get_runtime(doc)
120
- days_times = split_day_and_time(doc)
121
- days_times.map { |d_t| d_t[0].match(/\d+/).to_s }
122
- end
141
+ def self.get_one_movie_name(doc)
142
+ name = doc.xpath(REFLECTION_NAME).text
143
+ name.gsub!(/[\t\r\n]/, '')
144
+ end
123
145
 
124
- # get the release date
125
- def self.get_dates(doc)
126
- days_times = split_day_and_time(doc)
127
- days_times.map { |d_t| d_t[1].match(%r{\d+/\d+/\d+}).to_s } # mm/dd/yy
128
- end
146
+ def self.get_reflection(doc)
147
+ doc.xpath(REFLECTION_CLASS).text.gsub!(/[\t\r\n]/, '').split
148
+ end
129
149
 
130
- def self.split_day_and_time(doc)
131
- gap = "\n\t\t\t\t\s\s\s\s\t"
132
- days_times = doc.xpath(WHOLE_MOVIEWS_DATES)
133
- days_times.map { |d_t| d_t.text.split(gap) }
134
- end
150
+ # get the details of movie
151
+ def self.movie_details(code)
152
+ open_html(MOVIE_BASE_URL + code + '/')
153
+ end
135
154
 
136
- # get the code of movies
137
- def self.get_codes(doc)
138
- codes = doc.xpath(WHOLE_MOVIEWS_CODES)
139
- codes.map { |code| code.value.split('/')[2] }
140
- end
155
+ # open the destination url
156
+ def self.open_html(url)
157
+ Nokogiri::HTML(open(url))
158
+ end
141
159
 
142
- # get the trailer link of the movies
143
- def self.get_trailer(doc)
144
- codes = get_codes(doc)
145
- codes.map { |trailer| TRAILER_URL + trailer }
146
- end
160
+ # get the movie name
161
+ def self.get_titles(doc)
162
+ titles = doc.xpath(WHOLE_MOVIEWS_TITLES)
163
+ titles.map { |title| title.text.gsub(/[\t\n\r]/, '') }
164
+ end
147
165
 
148
- # build the hash for yaml output
149
- def self.mix(t, s, d, ti, tr)
150
- informations = t.each_with_index.map do |_, index|
151
- { 'title' => t[index], 'story' => s[index], \
152
- 'date' => d[index], 'runtime(minutes)' => ti[index], \
153
- 'trailer' => tr[index] }
154
- end
155
- informations
156
- end
166
+ # get the storyline of movie
167
+ def self.get_stories(doc)
168
+ storylines = doc.xpath(WHOLE_MOVIEWS_STORIES)
169
+ storylines.map(&:text) # { |story| story.text }
170
+ end
157
171
 
158
- # convert the schedules to yaml format
159
- def self.to_yaml(mix)
160
- mix.to_yaml
172
+ # get the runtime of movie
173
+ def self.get_runtime(doc)
174
+ days_times = split_day_and_time(doc)
175
+ days_times.map { |d_t| d_t[0].match(/\d+/).to_s }
176
+ end
177
+
178
+ # get the release date
179
+ def self.get_dates(doc)
180
+ days_times = split_day_and_time(doc)
181
+ days_times.map { |d_t| d_t[1].match(%r{\d+/\d+/\d+}).to_s } # mm/dd/yy
182
+ end
183
+
184
+ def self.split_day_and_time(doc)
185
+ gap = "\n\t\t\t\t\s\s\s\s\t"
186
+ days_times = doc.xpath(WHOLE_MOVIEWS_DATES)
187
+ days_times.map { |d_t| d_t.text.split(gap) }
188
+ end
189
+
190
+ # get the code of movies
191
+ def self.get_codes(doc)
192
+ codes = doc.xpath(WHOLE_MOVIEWS_CODES)
193
+ codes.map { |code| code.value.split('/')[2] }
194
+ end
195
+
196
+ # get the trailer link of the movies
197
+ def self.get_trailer(doc)
198
+ codes = get_codes(doc)
199
+ codes.map { |trailer| TRAILER_URL + trailer }
200
+ end
201
+
202
+ # build the hash for yaml output
203
+ def self.mix(t, s, d, ti, tr)
204
+ informations = t.each_with_index.map do |_, index|
205
+ { 'title' => t[index], 'story' => s[index], \
206
+ 'date' => d[index], 'runtime(minutes)' => ti[index], \
207
+ 'trailer' => tr[index] }
161
208
  end
209
+ informations
210
+ end
211
+
212
+ # convert the schedules to yaml format
213
+ def self.to_yaml(mix)
214
+ mix.to_yaml
162
215
  end
163
216
  end
@@ -1,3 +1,3 @@
1
1
  module MovieCrawler
2
- VERSION = '0.1.1'
2
+ VERSION = '0.2.2'
3
3
  end
@@ -4,7 +4,7 @@ Gem::Specification.new do |s|
4
4
  s.name = 'movie_crawler'
5
5
  s.version = MovieCrawler::VERSION
6
6
  s.executables << 'app'
7
- s.date = '2014-10-30'
7
+ s.date = '2014-11-02'
8
8
  s.summary = 'Grab the movies information from the atmovies.com'
9
9
  s.description = 'Grab the movies information from the atmovies.com'
10
10
  s.authors = ['Lee Chen', 'Chen Hung Tu', 'David Yang']
@@ -17,4 +17,5 @@ Gem::Specification.new do |s|
17
17
  s.add_development_dependency 'minitest-rg'
18
18
  s.add_runtime_dependency 'nokogiri', '>= 1.6.2' # v.1.6.2 has documented problems
19
19
  s.add_runtime_dependency 'iconv'
20
+ s.add_runtime_dependency 'mechanize'
20
21
  end
@@ -2,13 +2,13 @@ require 'minitest/autorun'
2
2
  require 'minitest/rg'
3
3
  require File.expand_path('../../lib/movie_crawler', __FILE__)
4
4
 
5
- LATEST = 'http://www.atmovies.com.tw/movie/movie_new.html'
6
- SECOND_ROUND = 'http://www.atmovies.com.tw/movie/movie_now2-1.html'
5
+ # LATEST = 'http://www.atmovies.com.tw/movie/movie_new.html'
6
+ # SECOND_ROUND = 'http://www.atmovies.com.tw/movie/movie_now2-1.html'
7
7
  TRAILER_URL = 'http://app.atmovies.com.tw/movie/movie.cfm?action=trailer&film_id='
8
8
 
9
9
  # generate a random number to test either latest or second_round
10
- rand < 0.5 ? url = LATEST : url = SECOND_ROUND
11
- sample = MovieCrawler::MovieInfo.movies_parser(url)
10
+ rand < 0.5 ? url = 'LATEST' : url = 'SECOND_ROUND'
11
+ sample = MovieCrawler.movies_parser(url)
12
12
 
13
13
  describe 'movies_parser should involve' do
14
14
 
@@ -4,7 +4,7 @@ require File.expand_path('../../lib/movie_crawler', __FILE__)
4
4
 
5
5
  # 1 to 3 means different test cases related to us, taipei, dvd
6
6
  rand_rank = rand(1..3)
7
- sample = MovieInfo.get_table(rand_rank.to_s)
7
+ sample = MovieCrawler.get_table(rand_rank.to_s)
8
8
 
9
9
  describe 'table should involve' do
10
10
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: movie_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lee Chen
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2014-10-30 00:00:00.000000000 Z
13
+ date: 2014-11-02 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: minitest
@@ -68,6 +68,20 @@ dependencies:
68
68
  - - ">="
69
69
  - !ruby/object:Gem::Version
70
70
  version: '0'
71
+ - !ruby/object:Gem::Dependency
72
+ name: mechanize
73
+ requirement: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ type: :runtime
79
+ prerelease: false
80
+ version_requirements: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: '0'
71
85
  description: Grab the movies information from the atmovies.com
72
86
  email: chung1350@hotmail.com
73
87
  executables:
@@ -75,13 +89,16 @@ executables:
75
89
  extensions: []
76
90
  extra_rdoc_files: []
77
91
  files:
92
+ - ".DS_Store"
78
93
  - ".gitignore"
79
94
  - ".travis.yml"
80
95
  - Gemfile
81
96
  - README.md
82
97
  - Rakefile
83
98
  - bin/app
99
+ - lib/.DS_Store
84
100
  - lib/movie_crawler.rb
101
+ - lib/movie_crawler/.DS_Store
85
102
  - lib/movie_crawler/crawler.rb
86
103
  - lib/movie_crawler/version.rb
87
104
  - movie_crawler-0.1.0.gem