movie_crawler 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2d8c82453127b3fdba97d8ddbd5058ccb4ddc0f6
4
+ data.tar.gz: 0d5910dac4e714107db946f8f2b2768ea273c13d
5
+ SHA512:
6
+ metadata.gz: 44a3bd6395d47b8ef7eae0ba323a657aef0db394700fb5175cf87590a2ed392ef5821b932a0a7df31a3229396f714576dc9870c0ee7799b40cc071326ecc0525
7
+ data.tar.gz: b9af1c5e8b28a244d0b86288b5fe0c026ad25a6264dd29fc1966db4695e75b7ef1ab4a3254bd79b2a91fcf988f318c0053408b0da730c3e54252bad2602ac4a0
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gem 'nokogiri'
4
+ gem 'iconv'
data/README.md ADDED
@@ -0,0 +1,34 @@
1
+ NiceSchedule
2
+ ===============
3
+
4
+ NiceSchedule tries to grab some information from [**Niceday**](http://plan.niceday.tw).
5
+
6
+ ## About
7
+
8
+ When you have no idea where to go on the weekend, the application will give you some inspiration.
9
+
10
+ ## Usage
11
+
12
+ Run the following on the command line.
13
+
14
+ (Choose the directory you want to put it in.)
15
+ ````
16
+ $ git clone git@github.com:ChenLiZhan/SOA-Crawler.git
17
+ ````
18
+
19
+ Then type,
20
+ ````
21
+ $ ruby app.rb
22
+ ````
23
+
24
+ ## Format
25
+
26
+ *Ex :*
27
+
28
+ **title:** 花蓮行(下雨版)
29
+
30
+ **days:** 3
31
+
32
+ **route:** 起點:陳記狀元粥鋪>松園別館>花蓮縣石雕博物館>公正包子店>一心泡泡冰>花蓮鐵道文化園區>周家蒸餃>賴桑壽司屋>時光二手書屋>林田山林業文化園區>滿妹豬腳>綠茶肉圓>瑞穗溫泉>慶修院>戴記扁食店>曾記麻糬>終點
33
+
34
+ **link:** http://plan.niceday.tw/trip/view/id/20973
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ require 'rake/testtask'
2
+
3
task default: [:spec]

desc 'Run specs'
task :spec do
  # Start one interpreter per spec file. A single `ruby -I lib a_spec.rb b_spec.rb`
  # runs only the first file and passes the second to it as a plain argument,
  # so the remaining specs would never execute.
  Dir.glob('spec/*_spec.rb').sort.each do |spec_file|
    sh 'ruby', '-I', 'lib', spec_file
  end
end
data/bin/app ADDED
@@ -0,0 +1,9 @@
1
#!/usr/bin/env ruby
# Command-line demo: prints each rank table and each movie listing in turn.
require 'movie_crawler'

# MovieInfo is defined inside the MovieCrawler module, so it has to be
# referenced through its namespace; a bare `MovieInfo` raises NameError.
crawler = MovieCrawler::MovieInfo

puts crawler.dvd_rank
puts crawler.us_weekend
puts crawler.taipei_weekend

# Categories understood by MovieInfo.movies.
%w[LATEST FIRST_ROUND SECOND_ROUND].each do |category|
  puts crawler.movies(category)
end
@@ -0,0 +1,166 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'yaml'
4
+ require 'iconv'
5
+
6
module MovieCrawler
  # Scrapes movie listings and rank tables from atmovies.com.tw.
  #
  # Every method is a class method. Pages are fetched through open-uri and
  # parsed with Nokogiri (both required at the top of the file). +movies+,
  # +us_weekend+, +taipei_weekend+ and +dvd_rank+ return YAML strings; the
  # remaining methods are the parsing steps they are built from.
  class MovieInfo
    LATEST = 'http://www.atmovies.com.tw/movie/movie_new.html'.freeze
    MOVIE_BASE_URL = 'http://www.atmovies.com.tw/movie/'.freeze
    WHOLE_MOVIEWS_TITLES = "//div[@class = 'title']/a".freeze
    WHOLE_MOVIEWS_STORIES = "//div[@class = 'story']".freeze
    WHOLE_MOVIEWS_DATES = "//div[@class = 'date']/b".freeze
    WHOLE_MOVIEWS_CODES = "//div[@class = 'title']/a/@href".freeze
    REFLECTION_BASE = 'http://app.atmovies.com.tw/tool/good.cfm?type=film'.freeze
    REFLECTION_FS = '&fs=2'.freeze
    REFLECTION_CLASS = "//div[@class = 'act01']".freeze
    REFLECTION_SATITLE = '&satitle='.freeze
    REFLECTION_SAID = '&said='.freeze
    REFLECTION_NAME = "//span[@class = 'at21b']".freeze
    TRAILER_URL = 'http://app.atmovies.com.tw/movie/movie.cfm?action=trailer&film_id='.freeze
    FIRST_ROUND = 'http://www.atmovies.com.tw/movie/movie_now-1.html'.freeze
    SECOND_ROUND = 'http://www.atmovies.com.tw/movie/movie_now2-1.html'.freeze
    ATMOVIES_MAIN_URL = 'http://www.atmovies.com.tw/home/movie_homepage.html'.freeze

    # Category name (upper case) => listing page, used by +movies+.
    CATEGORY_URLS = {
      'LATEST' => LATEST,
      'FIRST_ROUND' => FIRST_ROUND,
      'SECOND_ROUND' => SECOND_ROUND
    }.freeze

    # @return [String] YAML dump of rank table 1
    def self.us_weekend
      to_yaml(get_table('1'))
    end

    # @return [String] YAML dump of rank table 2
    def self.taipei_weekend
      to_yaml(get_table('2'))
    end

    # @return [String] YAML dump of rank table 3
    def self.dvd_rank
      to_yaml(get_table('3'))
    end

    # Fetches the atmovies home page and extracts one rank table.
    #
    # @param rankid [String, Integer] position of the wanted <div> inside the
    #   element whose id is "ranklist" (XPath positions start at 1)
    # @return [Array<Hash{Integer=>String}>] one single-pair hash per entry,
    #   keyed by its 1-based position
    def self.get_table(rankid)
      doc = open_html(ATMOVIES_MAIN_URL)
      # Interpolation (rather than +) lets an Integer rankid work as well.
      text = doc.xpath("//*[@id = 'ranklist']/div[#{rankid}]").text
      # Join "a : b" pairs and drop the remaining spaces so each entry stays
      # a single token when the text is split on whitespace. The split
      # already discards tabs and line breaks, so no further cleanup is needed.
      entries = text.gsub(' : ', ':').delete(' ').split
      # The last token is not kept — presumably a trailing label, not an entry.
      entries.pop
      rankmix(entries)
    end

    # Numbers the given entries starting from 1.
    #
    # @param t [Array] rank entries in display order
    # @return [Array<Hash{Integer=>String}>]
    def self.rankmix(t)
      t.each_with_index.map { |entry, index| { index + 1 => entry.to_s } }
    end

    # Returns the movies of one category as YAML.
    #
    # @param category [String, Symbol] 'LATEST', 'FIRST_ROUND' or
    #   'SECOND_ROUND', case-insensitive
    # @return [String] YAML list of movie hashes (see +mix+)
    # @raise [ArgumentError] when the category is not one of the three above
    def self.movies(category = 'LATEST')
      url = CATEGORY_URLS.fetch(category.to_s.upcase) do
        raise ArgumentError,
              "unknown category #{category.inspect}; expected one of #{CATEGORY_URLS.keys.join(', ')}"
      end
      to_yaml(movies_parser(url))
    end

    # Downloads a listing page and collects every movie found on it.
    #
    # @param url [String] address of a listing page
    # @return [Array<Hash{String=>String}>] see +mix+ for the keys
    def self.movies_parser(url)
      document = open_html(url)
      mix(get_titles(document), get_stories(document), get_dates(document),
          get_runtime(document), get_trailer(document))
    end

    # Builds the "&satitle=" query fragment: the text is converted from UTF-8
    # to Big5 with Iconv and then escaped with URI.encode.
    #
    # NOTE(review): URI.encode was removed in Ruby 3.0, so this method only
    # works on older interpreters. It is left untouched because the exact
    # escaping the remote service expects cannot be confirmed from here.
    def self.encode_zh(text)
      REFLECTION_SATITLE + URI.encode(Iconv.new('big5', 'utf-8').iconv(text)).to_s
    end

    # @param doc [#xpath] parsed page
    # @return [String] text matched by REFLECTION_NAME without tabs or line
    #   breaks
    def self.get_one_movie_name(doc)
      # Non-destructive on purpose: gsub! returns nil when nothing is
      # replaced, which made an already clean name come back as nil.
      doc.xpath(REFLECTION_NAME).text.delete("\t\r\n")
    end

    # @param doc [#xpath] parsed page
    # @return [Array<String>] whitespace-separated tokens of the text matched
    #   by REFLECTION_CLASS
    def self.get_reflection(doc)
      # Same gsub! pitfall as above; here the nil then broke the split call.
      doc.xpath(REFLECTION_CLASS).text.delete("\t\r\n").split
    end

    # Fetches and parses the page stored under MOVIE_BASE_URL for one movie.
    #
    # @param code [String] movie code as returned by +get_codes+
    def self.movie_details(code)
      open_html("#{MOVIE_BASE_URL}#{code}/")
    end

    # Downloads +url+ and parses the response as HTML.
    #
    # The address goes through URI.parse(...).open instead of Kernel#open:
    # Kernel#open stopped accepting URLs in Ruby 3.0 and would run a command
    # for a string that starts with "|". The block form closes the download
    # once Nokogiri has read it.
    #
    # @param url [String] http, https or ftp address
    # @return [Nokogiri::HTML::Document]
    def self.open_html(url)
      URI.parse(url).open { |io| Nokogiri::HTML(io) }
    end

    # @param doc [#xpath] parsed listing page
    # @return [Array<String>] movie titles without tabs or line breaks
    def self.get_titles(doc)
      doc.xpath(WHOLE_MOVIEWS_TITLES).map { |title| title.text.gsub(/[\t\n\r]/, '') }
    end

    # @param doc [#xpath] parsed listing page
    # @return [Array<String>] storyline text of every movie
    def self.get_stories(doc)
      doc.xpath(WHOLE_MOVIEWS_STORIES).map(&:text)
    end

    # @param doc [#xpath] parsed listing page
    # @return [Array<String>] first run of digits in each entry's first part,
    #   or '' when there is none
    def self.get_runtime(doc)
      # to_s guards against entries whose text is empty.
      split_day_and_time(doc).map { |runtime_part, _| runtime_part.to_s[/\d+/].to_s }
    end

    # @param doc [#xpath] parsed listing page
    # @return [Array<String>] first "digits/digits/digits" substring in each
    #   entry's second part, or '' when there is none
    def self.get_dates(doc)
      # to_s guards against entries that lack the separator (no second part).
      split_day_and_time(doc).map { |_, date_part| date_part.to_s[%r{\d+/\d+/\d+}].to_s }
    end

    # Splits the text of every WHOLE_MOVIEWS_DATES node at the whitespace run
    # that separates its two parts on the page.
    #
    # @param doc [#xpath] parsed listing page
    # @return [Array<Array<String>>]
    def self.split_day_and_time(doc)
      # newline, four tabs, four spaces, one tab ("\s" is a space here)
      gap = "\n\t\t\t\t\s\s\s\s\t"
      doc.xpath(WHOLE_MOVIEWS_DATES).map { |node| node.text.split(gap) }
    end

    # @param doc [#xpath] parsed listing page
    # @return [Array<String, nil>] third "/"-separated segment of each title
    #   link's href
    def self.get_codes(doc)
      doc.xpath(WHOLE_MOVIEWS_CODES).map { |href| href.value.split('/')[2] }
    end

    # @param doc [#xpath] parsed listing page
    # @return [Array<String>] TRAILER_URL followed by each movie code
    def self.get_trailer(doc)
      # Interpolation keeps one missing code from aborting the whole listing.
      get_codes(doc).map { |code| "#{TRAILER_URL}#{code}" }
    end

    # Combines the parallel arrays into one hash per movie.
    #
    # @param t [Array] titles
    # @param s [Array] storylines
    # @param d [Array] release dates
    # @param ti [Array] runtimes
    # @param tr [Array] trailer links
    # @return [Array<Hash{String=>Object}>] one hash per title with the keys
    #   'title', 'story', 'date', 'runtime(minutes)' and 'trailer'
    def self.mix(t, s, d, ti, tr)
      t.each_with_index.map do |title, index|
        {
          'title' => title,
          'story' => s[index],
          'date' => d[index],
          'runtime(minutes)' => ti[index],
          'trailer' => tr[index]
        }
      end
    end

    # @param mix [Object] data to serialise
    # @return [String] YAML representation
    def self.to_yaml(mix)
      mix.to_yaml
    end
  end
end
@@ -0,0 +1,3 @@
1
module MovieCrawler
  # Release number of the gem; frozen so it cannot be altered at runtime.
  VERSION = '0.1.0'.freeze
end
@@ -0,0 +1,2 @@
1
+ require 'movie_crawler/crawler.rb'
2
+ require 'movie_crawler/version.rb'
@@ -0,0 +1,20 @@
1
# Put lib/ on the load path so the version file can be required below.
$LOAD_PATH.push File.expand_path('../lib', __FILE__)
require 'movie_crawler/version'

Gem::Specification.new do |spec|
  # Identity
  spec.name = 'movie_crawler'
  spec.version = MovieCrawler::VERSION
  spec.executables << 'app'
  spec.date = '2014-10-25'
  spec.summary = 'Grab the movies information from the atmovies.com'
  spec.description = 'Grab the movies information from the atmovies.com'
  spec.authors = ['Lee Chen', 'Chen Hung Tu', 'David Yang']
  spec.email = 'chung1350@hotmail.com'

  # Package contents are whatever git currently tracks.
  spec.files = `git ls-files`.split("\n")
  spec.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.homepage = 'https://github.com/ChenLiZhan/SOA-Crawler'
  spec.license = 'MIT'

  # Dependencies
  spec.add_development_dependency 'minitest'
  spec.add_development_dependency 'minitest-rg'
  # Original author's note: "v.1.6.2 has documented problems".
  # NOTE(review): the constraint below still admits 1.6.2 — confirm the intent.
  spec.add_runtime_dependency 'nokogiri', '>= 1.6.2'
  spec.add_runtime_dependency 'iconv'
end
@@ -0,0 +1,39 @@
1
require 'minitest/autorun'
# The crawler lives in lib/movie_crawler/crawler.rb; there is no lib/crawler.rb.
require File.expand_path('../../lib/movie_crawler/crawler', __FILE__)

LATEST = 'http://www.atmovies.com.tw/movie/movie_new.html'
SECOND_ROUND = 'http://www.atmovies.com.tw/movie/movie_now2-1.html'
TRAILER_URL = 'http://app.atmovies.com.tw/movie/movie.cfm?action=trailer&film_id='

# Pick one of the two listing pages at random for this run.
url = rand < 0.5 ? LATEST : SECOND_ROUND
# MovieInfo is namespaced under MovieCrawler. The page is downloaded once,
# at load time, and shared by every example below.
sample = MovieCrawler::MovieInfo.movies_parser(url)

describe 'movies_parser should involve' do
  it 'must be non-empty' do
    sample.wont_be_empty
  end

  sample.each do |each_head|
    it 'title must be string' do
      each_head['title'].must_be_instance_of String
    end

    it 'story must be string' do
      each_head['story'].must_be_instance_of String
    end

    it 'date must match the format' do
      each_head['date'].must_match %r{\d+/\d+/\d+}
    end

    it 'runtime must be either empty or digit numbers' do
      # Anchored at both ends: the previous /\A|\d+/ matched every string,
      # because the empty \A alternative always succeeds.
      each_head['runtime(minutes)'].must_match(/\A\d*\z/)
    end

    it 'trailer link must accessible' do
      each_head['trailer'].must_include TRAILER_URL
    end
  end
end
data/spec/rank_spec.rb ADDED
@@ -0,0 +1,35 @@
1
require 'minitest/autorun'
require 'minitest/rg'
# The crawler lives in lib/movie_crawler/crawler.rb; there is no lib/crawler.rb.
require File.expand_path('../../lib/movie_crawler/crawler', __FILE__)

# 1 to 3 select the different rank tables (us, taipei, dvd); one is picked
# at random for this run.
rand_rank = rand(1..3)
# MovieInfo is namespaced under MovieCrawler. The page is downloaded once,
# at load time, and shared by every example below.
sample = MovieCrawler::MovieInfo.get_table(rand_rank.to_s)

describe 'table should involve' do
  it 'must be non-empty' do
    sample.wont_be_empty
  end

  it 'size must be ten' do
    sample.size.must_equal 10
  end

  sample.each do |each_movie|
    it 'each_movie must be non-empty' do
      each_movie.wont_be_empty
    end

    it 'each_key must be string' do
      # Despite the example's name, it is the hash values that are checked.
      each_movie.each_value { |value| value.must_be_instance_of String }
    end

    # Hash to_s would be like "{index=>\"movie_name\"}"
    # NOTE(review): this depends on the interpreter's Hash#to_s layout and on
    # the entry holding only non-word characters — confirm against live data.
    it 'each_key must match the format' do
      each_movie.to_s.must_match %r{\{\d+=>\"\W+\"\}}
    end
  end
end
metadata ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: movie_crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Lee Chen
8
+ - Chen Hung Tu
9
+ - David Yang
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2014-10-25 00:00:00.000000000 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: minitest
17
+ requirement: !ruby/object:Gem::Requirement
18
+ requirements:
19
+ - - ">="
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ version: '0'
29
+ - !ruby/object:Gem::Dependency
30
+ name: minitest-rg
31
+ requirement: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ type: :development
37
+ prerelease: false
38
+ version_requirements: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ - !ruby/object:Gem::Dependency
44
+ name: nokogiri
45
+ requirement: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: 1.6.2
50
+ type: :runtime
51
+ prerelease: false
52
+ version_requirements: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: 1.6.2
57
+ - !ruby/object:Gem::Dependency
58
+ name: iconv
59
+ requirement: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ type: :runtime
65
+ prerelease: false
66
+ version_requirements: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
71
+ description: Grab the movies information from the atmovies.com
72
+ email: chung1350@hotmail.com
73
+ executables:
74
+ - app
75
+ extensions: []
76
+ extra_rdoc_files: []
77
+ files:
78
+ - Gemfile
79
+ - README.md
80
+ - Rakefile
81
+ - bin/app
82
+ - lib/movie_crawler.rb
83
+ - lib/movie_crawler/crawler.rb
84
+ - lib/movie_crawler/version.rb
85
+ - movie_crawler.gemspec
86
+ - spec/movies_spec.rb
87
+ - spec/rank_spec.rb
88
+ homepage: https://github.com/ChenLiZhan/SOA-Crawler
89
+ licenses:
90
+ - MIT
91
+ metadata: {}
92
+ post_install_message:
93
+ rdoc_options: []
94
+ require_paths:
95
+ - lib
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - ">="
99
+ - !ruby/object:Gem::Version
100
+ version: '0'
101
+ required_rubygems_version: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - ">="
104
+ - !ruby/object:Gem::Version
105
+ version: '0'
106
+ requirements: []
107
+ rubyforge_project:
108
+ rubygems_version: 2.4.2
109
+ signing_key:
110
+ specification_version: 4
111
+ summary: Grab the movies information from the atmovies.com
112
+ test_files:
113
+ - spec/movies_spec.rb
114
+ - spec/rank_spec.rb