movie_crawler 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +4 -0
- data/README.md +34 -0
- data/Rakefile +8 -0
- data/bin/app +9 -0
- data/lib/movie_crawler/crawler.rb +166 -0
- data/lib/movie_crawler/version.rb +3 -0
- data/lib/movie_crawler.rb +2 -0
- data/movie_crawler.gemspec +20 -0
- data/spec/movies_spec.rb +39 -0
- data/spec/rank_spec.rb +35 -0
- metadata +114 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 2d8c82453127b3fdba97d8ddbd5058ccb4ddc0f6
|
4
|
+
data.tar.gz: 0d5910dac4e714107db946f8f2b2768ea273c13d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 44a3bd6395d47b8ef7eae0ba323a657aef0db394700fb5175cf87590a2ed392ef5821b932a0a7df31a3229396f714576dc9870c0ee7799b40cc071326ecc0525
|
7
|
+
data.tar.gz: b9af1c5e8b28a244d0b86288b5fe0c026ad25a6264dd29fc1966db4695e75b7ef1ab4a3254bd79b2a91fcf988f318c0053408b0da730c3e54252bad2602ac4a0
|
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
NiceSchedule
|
2
|
+
===============
|
3
|
+
|
4
|
+
NiceSchedule tries to grab some information from [**Niceday**](http://plan.niceday.tw).
|
5
|
+
|
6
|
+
## About
|
7
|
+
|
8
|
+
When you have no idea where to go on the weekend, the application will give you some inspiration.
|
9
|
+
|
10
|
+
## Usage
|
11
|
+
|
12
|
+
Run the following on the command line.
|
13
|
+
|
14
|
+
(Choose the directory you want to put it)
|
15
|
+
````
|
16
|
+
$ git clone git@github.com:ChenLiZhan/SOA-Crawler.git
|
17
|
+
````
|
18
|
+
|
19
|
+
Then type,
|
20
|
+
````
|
21
|
+
$ ruby app.rb
|
22
|
+
````
|
23
|
+
|
24
|
+
## Format
|
25
|
+
|
26
|
+
*Ex :*
|
27
|
+
|
28
|
+
**title:** 花蓮行(下雨版)
|
29
|
+
|
30
|
+
**days:** 3
|
31
|
+
|
32
|
+
**route:** 起點:陳記狀元粥鋪>松園別館>花蓮縣石雕博物館>公正包子店>一心泡泡冰>花蓮鐵道文化園區>周家蒸餃>賴桑壽司屋>時光二手書屋>林田山林業文化園區>滿妹豬腳>綠茶肉圓>瑞穗溫泉>慶修院>戴記扁食店>曾記麻糬>終點
|
33
|
+
|
34
|
+
**link:** http://plan.niceday.tw/trip/view/id/20973
|
data/Rakefile
ADDED
data/bin/app
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'movie_crawler'
|
3
|
+
|
4
|
+
# Print every report the crawler offers, each as a YAML document:
# the three rank tables first, then the three movie listings.
#
# The class lives inside the MovieCrawler module, so it is referenced by its
# fully qualified name; a bare `MovieInfo` is not defined at the top level
# unless something else includes the module.
crawler = MovieCrawler::MovieInfo

puts crawler.dvd_rank
puts crawler.us_weekend
puts crawler.taipei_weekend

# `movies` accepts 'LATEST', 'FIRST_ROUND' or 'SECOND_ROUND'.
puts crawler.movies('LATEST')
puts crawler.movies('FIRST_ROUND')
puts crawler.movies('SECOND_ROUND')
|
@@ -0,0 +1,166 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'yaml'
|
4
|
+
require 'iconv'
|
5
|
+
|
6
|
+
module MovieCrawler
  # Scrapes movie listings and rank tables from atmovies.com.tw.
  #
  # Every method is a class method. The public entry points (+movies+,
  # +us_weekend+, +taipei_weekend+, +dvd_rank+) return YAML strings; the
  # remaining methods are the parsing steps they are built from and operate
  # on the Nokogiri document returned by +open_html+.
  class MovieInfo
    # Listing page for the 'LATEST' category.
    LATEST = 'http://www.atmovies.com.tw/movie/movie_new.html'
    # Prefix of a single movie's page (see +movie_details+).
    MOVIE_BASE_URL = 'http://www.atmovies.com.tw/movie/'
    # XPath selectors applied to a listing page.
    WHOLE_MOVIEWS_TITLES = "//div[@class = 'title']/a"
    WHOLE_MOVIEWS_STORIES = "//div[@class = 'story']"
    WHOLE_MOVIEWS_DATES = "//div[@class = 'date']/b"
    WHOLE_MOVIEWS_CODES = "//div[@class = 'title']/a/@href"
    # URL, query-string fragments and selectors for the "good.cfm" tool page.
    REFLECTION_BASE = 'http://app.atmovies.com.tw/tool/good.cfm?type=film'
    REFLECTION_FS = '&fs=2'
    REFLECTION_CLASS = "//div[@class = 'act01']"
    REFLECTION_SATITLE = '&satitle='
    REFLECTION_SAID = '&said='
    REFLECTION_NAME = "//span[@class = 'at21b']"
    # Prefix of a trailer link; a movie code is appended (see +get_trailer+).
    TRAILER_URL = 'http://app.atmovies.com.tw/movie/movie.cfm?action=trailer&film_id='
    # Listing pages for the 'FIRST_ROUND' and 'SECOND_ROUND' categories.
    FIRST_ROUND = 'http://www.atmovies.com.tw/movie/movie_now-1.html'
    SECOND_ROUND = 'http://www.atmovies.com.tw/movie/movie_now2-1.html'
    # Page that carries the rank tables read by +get_table+.
    ATMOVIES_MAIN_URL = 'http://www.atmovies.com.tw/home/movie_homepage.html'

    # Rank table 1 of the main page, as YAML.
    def self.us_weekend
      to_yaml(get_table('1'))
    end

    # Rank table 2 of the main page, as YAML.
    def self.taipei_weekend
      to_yaml(get_table('2'))
    end

    # Rank table 3 of the main page, as YAML.
    def self.dvd_rank
      to_yaml(get_table('3'))
    end

    # Parses one rank table from the main page.
    #
    # rankid - 1-based position of the table's <div> under the element with
    #          id 'ranklist'; a String or an Integer.
    #
    # Returns an Array of single-pair Hashes as built by +rankmix+.
    def self.get_table(rankid)
      doc = open_html(ATMOVIES_MAIN_URL)
      text = doc.xpath("//*[@id = 'ranklist']/div[#{rankid}]").text
      # String#split without arguments splits on any run of whitespace, so
      # the resulting tokens can never contain tabs, CRs or newlines and need
      # no further stripping.
      rows = text.gsub(' : ', ':').gsub(' ', '').split
      # The last token is discarded.
      # NOTE(review): presumably a trailing non-rank item on the page - confirm.
      rows.pop
      rankmix(rows)
    end

    # Numbers the entries of +t+ starting at 1.
    #
    # Returns an Array with one Hash per entry: { position => entry.to_s }.
    def self.rankmix(t)
      t.each_with_index.map { |entry, index| { index + 1 => entry.to_s } }
    end

    # Fetches and parses the listing page for +category+ and returns it as YAML.
    #
    # category - 'LATEST' (default), 'FIRST_ROUND' or 'SECOND_ROUND';
    #            case-insensitive, Strings or Symbols.
    #
    # Raises ArgumentError for any other category instead of failing later
    # while trying to open a nil URL.
    def self.movies(category = 'LATEST')
      url = case category.to_s.upcase
            when 'LATEST' then LATEST
            when 'FIRST_ROUND' then FIRST_ROUND
            when 'SECOND_ROUND' then SECOND_ROUND
            else
              raise ArgumentError, "unknown movie category: #{category.inspect}"
            end
      to_yaml(movies_parser(url))
    end

    # Parses the listing page at +url+ accordingly.
    #
    # Returns an Array of Hashes, one per movie (see +mix+ for the keys).
    def self.movies_parser(url)
      document = open_html(url)
      titles = get_titles(document)
      stories = get_stories(document)
      dates = get_dates(document)
      trailers = get_trailer(document)
      runtimes = get_runtime(document)
      mix(titles, stories, dates, runtimes, trailers)
    end

    # Converts +text+ from UTF-8 to Big5, URL-escapes the result and prefixes
    # it with the '&satitle=' query fragment.
    #
    # NOTE(review): URI.encode was removed in Ruby 3.0; this method needs a
    # replacement escape routine before it can run there.
    def self.encode_zh(text)
      REFLECTION_SATITLE + URI.encode(Iconv.new('big5', 'utf-8').iconv(text)).to_s
    end

    # Text of the REFLECTION_NAME node(s) with tabs, CRs and newlines removed.
    #
    # Always returns a String; the non-bang gsub is used because gsub!
    # returns nil when there is nothing to remove.
    def self.get_one_movie_name(doc)
      doc.xpath(REFLECTION_NAME).text.gsub(/[\t\r\n]/, '')
    end

    # Text of the REFLECTION_CLASS node(s) with tabs, CRs and newlines
    # removed, split on the remaining whitespace.
    #
    # Returns an Array of Strings (empty when the text is empty).
    def self.get_reflection(doc)
      doc.xpath(REFLECTION_CLASS).text.gsub(/[\t\r\n]/, '').split
    end

    # Opens the page of the movie identified by +code+.
    def self.movie_details(code)
      open_html(MOVIE_BASE_URL + code + '/')
    end

    # Fetches +url+ and parses the response as HTML.
    #
    # Kernel#open no longer fetches URLs on Ruby 3.0+, so URI.open (public
    # from Ruby 2.5, provided by open-uri) is preferred; older Rubies fall
    # back to the open-uri patched Kernel#open.
    def self.open_html(url)
      io = URI.respond_to?(:open) ? URI.open(url) : open(url)
      Nokogiri::HTML(io)
    end

    # Movie titles of a listing page, with tabs, CRs and newlines removed.
    def self.get_titles(doc)
      doc.xpath(WHOLE_MOVIEWS_TITLES).map { |title| title.text.gsub(/[\t\n\r]/, '') }
    end

    # Storyline texts of a listing page, unmodified.
    def self.get_stories(doc)
      doc.xpath(WHOLE_MOVIEWS_STORIES).map(&:text)
    end

    # First run of digits in the first part of each date node.
    #
    # Returns an Array of Strings; an entry is '' when the part is missing or
    # contains no digits.
    def self.get_runtime(doc)
      split_day_and_time(doc).map { |parts| parts[0].to_s[/\d+/].to_s }
    end

    # First "digits/digits/digits" sequence in the second part of each date
    # node.
    #
    # Returns an Array of Strings; an entry is '' when the part is missing or
    # does not contain such a sequence.
    def self.get_dates(doc)
      split_day_and_time(doc).map { |parts| parts[1].to_s[%r{\d+/\d+/\d+}].to_s }
    end

    # Splits the text of every date node on the exact whitespace sequence
    # newline, four tabs, four spaces, tab ("\s" is a space inside a
    # double-quoted Ruby string).
    #
    # Returns an Array of Arrays of Strings.
    def self.split_day_and_time(doc)
      gap = "\n\t\t\t\t\s\s\s\s\t"
      doc.xpath(WHOLE_MOVIEWS_DATES).map { |node| node.text.split(gap) }
    end

    # Movie codes of a listing page: the third '/'-separated segment of each
    # title link's href.
    def self.get_codes(doc)
      doc.xpath(WHOLE_MOVIEWS_CODES).map { |href| href.value.split('/')[2] }
    end

    # Trailer links of a listing page: TRAILER_URL followed by the movie code.
    def self.get_trailer(doc)
      get_codes(doc).map { |code| TRAILER_URL + code }
    end

    # Builds the per-movie hashes for the YAML output.
    #
    # t, s, d, ti, tr - parallel collections of titles, stories, dates,
    #                   runtimes and trailer links.
    #
    # Returns one Hash per title; positions missing from the other
    # collections yield nil values.
    def self.mix(t, s, d, ti, tr)
      t.each_with_index.map do |title, index|
        { 'title' => title, 'story' => s[index],
          'date' => d[index], 'runtime(minutes)' => ti[index],
          'trailer' => tr[index] }
      end
    end

    # Serialises +mix+ to a YAML string.
    def self.to_yaml(mix)
      mix.to_yaml
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# Gem specification for movie_crawler.
# Make lib/ loadable so the version constant can be required below.
$LOAD_PATH.push File.expand_path('../lib', __FILE__)
require 'movie_crawler/version'

Gem::Specification.new do |s|
  s.name = 'movie_crawler'
  s.version = MovieCrawler::VERSION
  s.executables << 'app'
  s.date = '2014-10-25'
  s.summary = 'Grab the movies information from the atmovies.com'
  s.description = 'Grab the movies information from the atmovies.com'
  s.authors = ['Lee Chen', 'Chen Hung Tu', 'David Yang']
  s.email = 'chung1350@hotmail.com'
  # File lists come from git, so the gem must be built inside a git checkout.
  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.homepage = 'https://github.com/ChenLiZhan/SOA-Crawler'
  s.license = 'MIT'
  s.add_development_dependency 'minitest'
  s.add_development_dependency 'minitest-rg'
  # NOTE(review): the original remark said "v.1.6.2 has documented problems",
  # yet '>= 1.6.2' still admits 1.6.2 - confirm whether '> 1.6.2' was meant.
  s.add_runtime_dependency 'nokogiri', '>= 1.6.2'
  s.add_runtime_dependency 'iconv'
end
|
data/spec/movies_spec.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# Spec for MovieCrawler::MovieInfo.movies_parser. It fetches a live listing
# page, so it needs network access.
require 'minitest/autorun'
# The crawler lives in lib/movie_crawler/crawler.rb (there is no lib/crawler.rb).
require File.expand_path('../../lib/movie_crawler/crawler', __FILE__)

LATEST = 'http://www.atmovies.com.tw/movie/movie_new.html'
SECOND_ROUND = 'http://www.atmovies.com.tw/movie/movie_now2-1.html'
TRAILER_URL = 'http://app.atmovies.com.tw/movie/movie.cfm?action=trailer&film_id='

# Randomly exercise either the latest or the second-round listing.
url = rand < 0.5 ? LATEST : SECOND_ROUND
sample = MovieCrawler::MovieInfo.movies_parser(url)

describe 'movies_parser should involve' do
  it 'must be non-empty' do
    sample.wont_be_empty
  end

  sample.each do |each_head|
    it 'title must be string' do
      each_head['title'].must_be_instance_of String
    end

    it 'story must be string' do
      each_head['story'].must_be_instance_of String
    end

    it 'date must match the format' do
      each_head['date'].must_match %r{\d+/\d+/\d+}
    end

    it 'runtime must be either empty or digit numbers' do
      # Anchored on both ends: the previous /\A|\d+/ matched every string.
      each_head['runtime(minutes)'].must_match(/\A\d*\z/)
    end

    it 'trailer link must accessible' do
      each_head['trailer'].must_include TRAILER_URL
    end
  end
end
|
data/spec/rank_spec.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# Spec for MovieCrawler::MovieInfo.get_table. It fetches the live main page,
# so it needs network access.
require 'minitest/autorun'
require 'minitest/rg'
# The crawler lives in lib/movie_crawler/crawler.rb (there is no lib/crawler.rb).
require File.expand_path('../../lib/movie_crawler/crawler', __FILE__)

# 1 to 3 select the different rank tables: us, taipei, dvd
rand_rank = rand(1..3)
sample = MovieCrawler::MovieInfo.get_table(rand_rank.to_s)

describe 'table should involve' do
  it 'must be non-empty' do
    sample.wont_be_empty
  end

  it 'size must be ten' do
    sample.size.must_equal 10
  end

  sample.each do |each_movie|
    it 'each_movie must be non-empty' do
      each_movie.wont_be_empty
    end

    it 'each_key must be string' do
      each_movie.each_value { |value| value.must_be_instance_of String }
    end

    # Hash#to_s looks like "{index=>\"movie_name\"}"; Ruby 3.4+ prints spaces
    # around "=>", so optional whitespace is accepted there.
    # NOTE(review): \W+ rejects names containing ASCII letters, digits or
    # underscores - confirm that this is intended.
    it 'each_key must match the format' do
      each_movie.to_s.must_match %r{\{\d+\s*=>\s*\"\W+\"\}}
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: movie_crawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Lee Chen
|
8
|
+
- Chen Hung Tu
|
9
|
+
- David Yang
|
10
|
+
autorequire:
|
11
|
+
bindir: bin
|
12
|
+
cert_chain: []
|
13
|
+
date: 2014-10-25 00:00:00.000000000 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: minitest
|
17
|
+
requirement: !ruby/object:Gem::Requirement
|
18
|
+
requirements:
|
19
|
+
- - ">="
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
version: '0'
|
29
|
+
- !ruby/object:Gem::Dependency
|
30
|
+
name: minitest-rg
|
31
|
+
requirement: !ruby/object:Gem::Requirement
|
32
|
+
requirements:
|
33
|
+
- - ">="
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: '0'
|
36
|
+
type: :development
|
37
|
+
prerelease: false
|
38
|
+
version_requirements: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: '0'
|
43
|
+
- !ruby/object:Gem::Dependency
|
44
|
+
name: nokogiri
|
45
|
+
requirement: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 1.6.2
|
50
|
+
type: :runtime
|
51
|
+
prerelease: false
|
52
|
+
version_requirements: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: 1.6.2
|
57
|
+
- !ruby/object:Gem::Dependency
|
58
|
+
name: iconv
|
59
|
+
requirement: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
64
|
+
type: :runtime
|
65
|
+
prerelease: false
|
66
|
+
version_requirements: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: '0'
|
71
|
+
description: Grab the movies information from the atmovies.com
|
72
|
+
email: chung1350@hotmail.com
|
73
|
+
executables:
|
74
|
+
- app
|
75
|
+
extensions: []
|
76
|
+
extra_rdoc_files: []
|
77
|
+
files:
|
78
|
+
- Gemfile
|
79
|
+
- README.md
|
80
|
+
- Rakefile
|
81
|
+
- bin/app
|
82
|
+
- lib/movie_crawler.rb
|
83
|
+
- lib/movie_crawler/crawler.rb
|
84
|
+
- lib/movie_crawler/version.rb
|
85
|
+
- movie_crawler.gemspec
|
86
|
+
- spec/movies_spec.rb
|
87
|
+
- spec/rank_spec.rb
|
88
|
+
homepage: https://github.com/ChenLiZhan/SOA-Crawler
|
89
|
+
licenses:
|
90
|
+
- MIT
|
91
|
+
metadata: {}
|
92
|
+
post_install_message:
|
93
|
+
rdoc_options: []
|
94
|
+
require_paths:
|
95
|
+
- lib
|
96
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - ">="
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: '0'
|
101
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
102
|
+
requirements:
|
103
|
+
- - ">="
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: '0'
|
106
|
+
requirements: []
|
107
|
+
rubyforge_project:
|
108
|
+
rubygems_version: 2.4.2
|
109
|
+
signing_key:
|
110
|
+
specification_version: 4
|
111
|
+
summary: Grab the movies information from the atmovies.com
|
112
|
+
test_files:
|
113
|
+
- spec/movies_spec.rb
|
114
|
+
- spec/rank_spec.rb
|