movie_crawler 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2d8c82453127b3fdba97d8ddbd5058ccb4ddc0f6
4
+ data.tar.gz: 0d5910dac4e714107db946f8f2b2768ea273c13d
5
+ SHA512:
6
+ metadata.gz: 44a3bd6395d47b8ef7eae0ba323a657aef0db394700fb5175cf87590a2ed392ef5821b932a0a7df31a3229396f714576dc9870c0ee7799b40cc071326ecc0525
7
+ data.tar.gz: b9af1c5e8b28a244d0b86288b5fe0c026ad25a6264dd29fc1966db4695e75b7ef1ab4a3254bd79b2a91fcf988f318c0053408b0da730c3e54252bad2602ac4a0
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gem 'nokogiri'
4
+ gem 'iconv'
data/README.md ADDED
@@ -0,0 +1,34 @@
1
+ NiceSchedule
2
+ ===============
3
+
4
+ NiceSchedule tries to grab some information from [**Niceday**](http://plan.niceday.tw).
5
+
6
+ ## About
7
+
8
+ When you have no idea where to go on the weekend, the application will give you some inspiration.
9
+
10
+ ## Usage
11
+
12
+ Copy the following commands into your command line.
13
+
14
+ (Choose the directory you want to put it)
15
+ ````
16
+ $ git clone git@github.com:ChenLiZhan/SOA-Crawler.git
17
+ ````
18
+
19
+ Then type,
20
+ ````
21
+ $ ruby app.rb
22
+ ````
23
+
24
+ ## Format
25
+
26
+ *Ex :*
27
+
28
+ **title:** 花蓮行(下雨版)
29
+
30
+ **days:** 3
31
+
32
+ **route:** 起點:陳記狀元粥鋪>松園別館>花蓮縣石雕博物館>公正包子店>一心泡泡冰>花蓮鐵道文化園區>周家蒸餃>賴桑壽司屋>時光二手書屋>林田山林業文化園區>滿妹豬腳>綠茶肉圓>瑞穗溫泉>慶修院>戴記扁食店>曾記麻糬>終點
33
+
34
+ **link:** http://plan.niceday.tw/trip/view/id/20973
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ require 'rake/testtask'
2
+
3
+ task default: [:spec]
4
+
5
+ desc 'Run specs'
6
+ task :spec do
7
+ sh 'ruby -I lib spec/*_spec.rb'
8
+ end
data/bin/app ADDED
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby
2
+ require 'movie_crawler'
3
+
4
+ puts MovieInfo.dvd_rank
5
+ puts MovieInfo.us_weekend
6
+ puts MovieInfo.taipei_weekend
7
+ puts MovieInfo.movies('LATEST') # 'first_round','second_round
8
+ puts MovieInfo.movies('FIRST_ROUND')
9
+ puts MovieInfo.movies('SECOND_ROUND')
@@ -0,0 +1,166 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'yaml'
4
+ require 'iconv'
5
+
6
+ module MovieCrawler
7
+ # get the info from atmovies
8
+ class MovieInfo
9
+ LATEST = 'http://www.atmovies.com.tw/movie/movie_new.html'
10
+ MOVIE_BASE_URL = 'http://www.atmovies.com.tw/movie/'
11
+ WHOLE_MOVIEWS_TITLES = "//div[@class = 'title']/a"
12
+ WHOLE_MOVIEWS_STORIES = "//div[@class = 'story']"
13
+ WHOLE_MOVIEWS_DATES = "//div[@class = 'date']/b"
14
+ WHOLE_MOVIEWS_CODES = "//div[@class = 'title']/a/@href"
15
+ REFLECTION_BASE = 'http://app.atmovies.com.tw/tool/good.cfm?type=film'
16
+ REFLECTION_FS = '&fs=2'
17
+ REFLECTION_CLASS = "//div[@class = 'act01']"
18
+ REFLECTION_SATITLE = '&satitle='
19
+ REFLECTION_SAID = '&said='
20
+ REFLECTION_NAME = "//span[@class = 'at21b']"
21
+ TRAILER_URL = 'http://app.atmovies.com.tw/movie/movie.cfm?action=trailer&film_id='
22
+ FIRST_ROUND = 'http://www.atmovies.com.tw/movie/movie_now-1.html'
23
+ SECOND_ROUND = 'http://www.atmovies.com.tw/movie/movie_now2-1.html'
24
+ ATMOVIES_MAIN_URL = 'http://www.atmovies.com.tw/home/movie_homepage.html'
25
+
26
+     # parsers for the three ranking lists
27
+ def self.us_weekend
28
+ result = get_table('1')
29
+ to_yaml(result)
30
+ end
31
+
32
+ def self.taipei_weekend
33
+ result = get_table('2')
34
+ to_yaml(result)
35
+ end
36
+
37
+ def self.dvd_rank
38
+ result = get_table('3')
39
+ to_yaml(result)
40
+ end
41
+
42
+ # parse the ranktable info
43
+ def self.get_table(rankid)
44
+ doc = open_html(ATMOVIES_MAIN_URL)
45
+ table = doc.xpath("//*[@id = 'ranklist']/div[" + rankid + ']').text
46
+ table = table.gsub(' : ', ':').gsub(' ', '').split
47
+ table = table.each { |item| item.gsub(/[\t\r\n]/, '') }
48
+ table.pop
49
+ rankmix(table)
50
+ end
51
+
52
+ # mix the rank info
53
+ def self.rankmix(t)
54
+ t.each_with_index.map do |_, index|
55
+ {
56
+ index + 1 => t[index].to_s
57
+ }
58
+ end
59
+ end
60
+
61
+ # switch to different url accordingly
62
+ def self.movies(category = 'LATEST')
63
+ case category.upcase
64
+ when 'LATEST'
65
+ url = LATEST
66
+ when 'FIRST_ROUND'
67
+ url = FIRST_ROUND
68
+ when 'SECOND_ROUND'
69
+ url = SECOND_ROUND
70
+ end
71
+ result = movies_parser(url)
72
+ to_yaml(result)
73
+ end
74
+
75
+ # parse the movies acoordingly
76
+ def self.movies_parser(url)
77
+ document = open_html(url)
78
+ titles = get_titles(document)
79
+ stories = get_stories(document)
80
+ dates = get_dates(document)
81
+ trailers = get_trailer(document)
82
+ runtimes = get_runtime(document)
83
+ mix(titles, stories, dates, runtimes, trailers)
84
+ end
85
+
86
+     def self.encode_zh(text) # builds the '&satitle=' query fragment for text
87
+       REFLECTION_SATITLE + URI.encode(Iconv.new('big5', 'utf-8').iconv(text)).to_s # utf-8 -> big5, then escaped; NOTE(review): URI.encode is gone in Ruby 3.0+ -- confirm supported Ruby versions
88
+     end
89
+
90
+ def self.get_one_movie_name(doc)
91
+ name = doc.xpath(REFLECTION_NAME).text
92
+ name.gsub!(/[\t\r\n]/, '')
93
+ end
94
+
95
+ def self.get_reflection(doc)
96
+ doc.xpath(REFLECTION_CLASS).text.gsub!(/[\t\r\n]/, '').split
97
+ end
98
+
99
+     # fetch and parse the detail page located at MOVIE_BASE_URL + code + '/'
100
+     def self.movie_details(code)
101
+       open_html(MOVIE_BASE_URL + code + '/')
102
+     end
103
+
104
+ # open the destination url
105
+ def self.open_html(url)
106
+ Nokogiri::HTML(open(url))
107
+ end
108
+
109
+ # get the movie name
110
+ def self.get_titles(doc)
111
+ titles = doc.xpath(WHOLE_MOVIEWS_TITLES)
112
+ titles.map { |title| title.text.gsub(/[\t\n\r]/, '') }
113
+ end
114
+
115
+ # get the storyline of movie
116
+ def self.get_stories(doc)
117
+ storylines = doc.xpath(WHOLE_MOVIEWS_STORIES)
118
+ storylines.map(&:text) # { |story| story.text }
119
+ end
120
+
121
+ # get the runtime of movie
122
+ def self.get_runtime(doc)
123
+ days_times = split_day_and_time(doc)
124
+ days_times.map { |d_t| d_t[0].match(/\d+/).to_s }
125
+ end
126
+
127
+ # get the release date
128
+ def self.get_dates(doc)
129
+ days_times = split_day_and_time(doc)
130
+ days_times.map { |d_t| d_t[1].match(%r{\d+/\d+/\d+}).to_s } # mm/dd/yy
131
+ end
132
+
133
+     def self.split_day_and_time(doc) # returns one array of text chunks per date node
134
+       gap = "\n\t\t\t\t\s\s\s\s\t" # exact whitespace run used as the split point (\s is a space in a double-quoted string)
135
+       days_times = doc.xpath(WHOLE_MOVIEWS_DATES)
136
+       days_times.map { |d_t| d_t.text.split(gap) }
137
+     end
138
+
139
+ # get the code of movies
140
+ def self.get_codes(doc)
141
+ codes = doc.xpath(WHOLE_MOVIEWS_CODES)
142
+ codes.map { |code| code.value.split('/')[2] }
143
+ end
144
+
145
+ # get the trailer link of the movies
146
+ def self.get_trailer(doc)
147
+ codes = get_codes(doc)
148
+ codes.map { |trailer| TRAILER_URL + trailer }
149
+ end
150
+
151
+ # build the hash for yaml output
152
+ def self.mix(t, s, d, ti, tr)
153
+ informations = t.each_with_index.map do |_, index|
154
+ { 'title' => t[index], 'story' => s[index], \
155
+ 'date' => d[index], 'runtime(minutes)' => ti[index], \
156
+ 'trailer' => tr[index] }
157
+ end
158
+ informations
159
+ end
160
+
161
+ # convert the schedules to yaml format
162
+ def self.to_yaml(mix)
163
+ mix.to_yaml
164
+ end
165
+ end
166
+ end
@@ -0,0 +1,3 @@
1
+ module MovieCrawler
2
+ VERSION = '0.1.0'
3
+ end
@@ -0,0 +1,2 @@
1
+ require 'movie_crawler/crawler.rb'
2
+ require 'movie_crawler/version.rb'
@@ -0,0 +1,20 @@
1
+ $:.push File.expand_path("../lib", __FILE__) # put lib/ on the load path so the version file below can be required
2
+ require 'movie_crawler/version'
3
+ Gem::Specification.new do |s|
4
+   s.name = 'movie_crawler'
5
+   s.version = MovieCrawler::VERSION
6
+   s.executables << 'app'
7
+   s.date = '2014-10-25'
8
+   s.summary = 'Grab the movies information from the atmovies.com'
9
+   s.description = 'Grab the movies information from the atmovies.com'
10
+   s.authors = ['Lee Chen', 'Chen Hung Tu', 'David Yang']
11
+   s.email = 'chung1350@hotmail.com'
12
+   s.files = `git ls-files`.split("\n") # file list comes from git, so the gem must be built inside a git checkout
13
+   s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
14
+   s.homepage = 'https://github.com/ChenLiZhan/SOA-Crawler'
15
+   s.license = 'MIT'
16
+   s.add_development_dependency 'minitest'
17
+   s.add_development_dependency 'minitest-rg'
18
+   s.add_runtime_dependency 'nokogiri', '>= 1.6.2' # original note: v.1.6.2 has documented problems. NOTE(review): '>=' still admits 1.6.2 -- confirm the intended bound
19
+   s.add_runtime_dependency 'iconv'
20
+ end
@@ -0,0 +1,39 @@
1
+ require 'minitest/autorun'
2
+ require File.expand_path('../../lib/crawler', __FILE__)
3
+
4
+ LATEST = 'http://www.atmovies.com.tw/movie/movie_new.html'
5
+ SECOND_ROUND = 'http://www.atmovies.com.tw/movie/movie_now2-1.html'
6
+ TRAILER_URL = 'http://app.atmovies.com.tw/movie/movie.cfm?action=trailer&film_id='
7
+
8
+ # generate a random number to test either latest or second_round
9
+ rand < 0.5 ? url = LATEST : url = SECOND_ROUND
10
+ sample = MovieInfo.movies_parser(url)
11
+
12
+ describe 'movies_parser should involve' do
13
+
14
+ it 'must be non-empty' do
15
+ sample.wont_be_empty
16
+ end
17
+
18
+ sample.each do |each_head|
19
+ it 'title must be string' do
20
+ each_head['title'].must_be_instance_of String
21
+ end
22
+
23
+ it 'story must be string' do
24
+ each_head['story'].must_be_instance_of String
25
+ end
26
+
27
+ it 'date must match the format' do
28
+ each_head['date'].must_match %r{\d+/\d+/\d+}
29
+ end
30
+
31
+ it 'runtime must be either empty or digit numbers' do
32
+ each_head['runtime(minutes)'].must_match(/\A|\d+/)
33
+ end
34
+
35
+ it 'trailer link must accessible' do
36
+ each_head['trailer'].must_include TRAILER_URL
37
+ end
38
+ end
39
+ end
data/spec/rank_spec.rb ADDED
@@ -0,0 +1,35 @@
1
+ require 'minitest/autorun'
2
+ require 'minitest/rg'
3
+ require File.expand_path('../../lib/crawler', __FILE__)
4
+
5
+ # 1 to 3 means diffent test case related to us, taipei, dvd
6
+ rand_rank = rand(1..3)
7
+ sample = MovieInfo.get_table(rand_rank.to_s)
8
+
9
+ describe 'table should involve' do
10
+
11
+ it 'must be non-empty' do
12
+ sample.wont_be_empty
13
+ end
14
+
15
+ it 'size must be ten' do
16
+ sample.size.must_equal 10
17
+ end
18
+
19
+ sample.each do |each_movie|
20
+
21
+ it 'each_movie must be non-empty' do
22
+ each_movie.wont_be_empty
23
+ end
24
+
25
+ it 'each_key must be string' do
26
+ each_movie.each_value { |key| key.must_be_instance_of String }
27
+ end
28
+
29
+ # Hash to_s would be like "{index=>\"movie_name\"}"
30
+ it 'each_key must match the format' do
31
+ each_movie.to_s.must_match %r{\{\d+=>\"\W+\"\}}
32
+ end
33
+ end
34
+
35
+ end
metadata ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: movie_crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Lee Chen
8
+ - Chen Hung Tu
9
+ - David Yang
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2014-10-25 00:00:00.000000000 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: minitest
17
+ requirement: !ruby/object:Gem::Requirement
18
+ requirements:
19
+ - - ">="
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ version: '0'
29
+ - !ruby/object:Gem::Dependency
30
+ name: minitest-rg
31
+ requirement: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ type: :development
37
+ prerelease: false
38
+ version_requirements: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ - !ruby/object:Gem::Dependency
44
+ name: nokogiri
45
+ requirement: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: 1.6.2
50
+ type: :runtime
51
+ prerelease: false
52
+ version_requirements: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: 1.6.2
57
+ - !ruby/object:Gem::Dependency
58
+ name: iconv
59
+ requirement: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ type: :runtime
65
+ prerelease: false
66
+ version_requirements: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
71
+ description: Grab the movies information from the atmovies.com
72
+ email: chung1350@hotmail.com
73
+ executables:
74
+ - app
75
+ extensions: []
76
+ extra_rdoc_files: []
77
+ files:
78
+ - Gemfile
79
+ - README.md
80
+ - Rakefile
81
+ - bin/app
82
+ - lib/movie_crawler.rb
83
+ - lib/movie_crawler/crawler.rb
84
+ - lib/movie_crawler/version.rb
85
+ - movie_crawler.gemspec
86
+ - spec/movies_spec.rb
87
+ - spec/rank_spec.rb
88
+ homepage: https://github.com/ChenLiZhan/SOA-Crawler
89
+ licenses:
90
+ - MIT
91
+ metadata: {}
92
+ post_install_message:
93
+ rdoc_options: []
94
+ require_paths:
95
+ - lib
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - ">="
99
+ - !ruby/object:Gem::Version
100
+ version: '0'
101
+ required_rubygems_version: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - ">="
104
+ - !ruby/object:Gem::Version
105
+ version: '0'
106
+ requirements: []
107
+ rubyforge_project:
108
+ rubygems_version: 2.4.2
109
+ signing_key:
110
+ specification_version: 4
111
+ summary: Grab the movies information from the atmovies.com
112
+ test_files:
113
+ - spec/movies_spec.rb
114
+ - spec/rank_spec.rb