movie_crawler 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +4 -0
- data/README.md +34 -0
- data/Rakefile +8 -0
- data/bin/app +9 -0
- data/lib/movie_crawler/crawler.rb +166 -0
- data/lib/movie_crawler/version.rb +3 -0
- data/lib/movie_crawler.rb +2 -0
- data/movie_crawler.gemspec +20 -0
- data/spec/movies_spec.rb +39 -0
- data/spec/rank_spec.rb +35 -0
- metadata +114 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 2d8c82453127b3fdba97d8ddbd5058ccb4ddc0f6
|
4
|
+
data.tar.gz: 0d5910dac4e714107db946f8f2b2768ea273c13d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 44a3bd6395d47b8ef7eae0ba323a657aef0db394700fb5175cf87590a2ed392ef5821b932a0a7df31a3229396f714576dc9870c0ee7799b40cc071326ecc0525
|
7
|
+
data.tar.gz: b9af1c5e8b28a244d0b86288b5fe0c026ad25a6264dd29fc1966db4695e75b7ef1ab4a3254bd79b2a91fcf988f318c0053408b0da730c3e54252bad2602ac4a0
|
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
NiceSchedule
|
2
|
+
===============
|
3
|
+
|
4
|
+
NiceSchedule tries to grab some information from [**Niceday**](http://plan.niceday.tw).
|
5
|
+
|
6
|
+
## About
|
7
|
+
|
8
|
+
When you have no idea where to go on the weekend, the application will give you some inspiration.
|
9
|
+
|
10
|
+
## Usage
|
11
|
+
|
12
|
+
Run the following on the command line.
|
13
|
+
|
14
|
+
(Choose the directory you want to put it in)
|
15
|
+
````
|
16
|
+
$ git clone git@github.com:ChenLiZhan/SOA-Crawler.git
|
17
|
+
````
|
18
|
+
|
19
|
+
Then type,
|
20
|
+
````
|
21
|
+
$ ruby app.rb
|
22
|
+
````
|
23
|
+
|
24
|
+
## Format
|
25
|
+
|
26
|
+
*Ex :*
|
27
|
+
|
28
|
+
**title:** 花蓮行(下雨版)
|
29
|
+
|
30
|
+
**days:** 3
|
31
|
+
|
32
|
+
**route:** 起點:陳記狀元粥鋪>松園別館>花蓮縣石雕博物館>公正包子店>一心泡泡冰>花蓮鐵道文化園區>周家蒸餃>賴桑壽司屋>時光二手書屋>林田山林業文化園區>滿妹豬腳>綠茶肉圓>瑞穗溫泉>慶修院>戴記扁食店>曾記麻糬>終點
|
33
|
+
|
34
|
+
**link:** http://plan.niceday.tw/trip/view/id/20973
|
data/Rakefile
ADDED
data/bin/app
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line entry point: prints every report the crawler can produce,
# each as a YAML document on standard output.
require 'movie_crawler'

# MovieInfo is defined inside the MovieCrawler module, so it is referenced by
# its fully-qualified name; this works whether or not the module has been
# mixed into the top level.
crawler = MovieCrawler::MovieInfo

puts crawler.dvd_rank
puts crawler.us_weekend
puts crawler.taipei_weekend
# Accepted categories: 'LATEST', 'FIRST_ROUND', 'SECOND_ROUND'
puts crawler.movies('LATEST')
puts crawler.movies('FIRST_ROUND')
puts crawler.movies('SECOND_ROUND')
|
@@ -0,0 +1,166 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'yaml'
|
4
|
+
require 'iconv'
|
5
|
+
|
6
|
+
module MovieCrawler
  # Scrapes movie listings and rank tables from atmovies.com.tw and returns
  # them as YAML strings. All methods are class-level; no instance state.
  class MovieInfo
    LATEST = 'http://www.atmovies.com.tw/movie/movie_new.html'
    MOVIE_BASE_URL = 'http://www.atmovies.com.tw/movie/'
    WHOLE_MOVIEWS_TITLES = "//div[@class = 'title']/a"
    WHOLE_MOVIEWS_STORIES = "//div[@class = 'story']"
    WHOLE_MOVIEWS_DATES = "//div[@class = 'date']/b"
    WHOLE_MOVIEWS_CODES = "//div[@class = 'title']/a/@href"
    REFLECTION_BASE = 'http://app.atmovies.com.tw/tool/good.cfm?type=film'
    REFLECTION_FS = '&fs=2'
    REFLECTION_CLASS = "//div[@class = 'act01']"
    REFLECTION_SATITLE = '&satitle='
    REFLECTION_SAID = '&said='
    REFLECTION_NAME = "//span[@class = 'at21b']"
    TRAILER_URL = 'http://app.atmovies.com.tw/movie/movie.cfm?action=trailer&film_id='
    FIRST_ROUND = 'http://www.atmovies.com.tw/movie/movie_now-1.html'
    SECOND_ROUND = 'http://www.atmovies.com.tw/movie/movie_now2-1.html'
    ATMOVIES_MAIN_URL = 'http://www.atmovies.com.tw/home/movie_homepage.html'

    # Maps the category names accepted by .movies to their listing pages.
    CATEGORY_URLS = {
      'LATEST' => LATEST,
      'FIRST_ROUND' => FIRST_ROUND,
      'SECOND_ROUND' => SECOND_ROUND
    }.freeze

    # Rank table found in the first <div> of the home page's rank list.
    #
    # @return [String] YAML list of { position => entry } hashes
    def self.us_weekend
      to_yaml(get_table('1'))
    end

    # Rank table found in the second <div> of the home page's rank list.
    #
    # @return [String] YAML list of { position => entry } hashes
    def self.taipei_weekend
      to_yaml(get_table('2'))
    end

    # Rank table found in the third <div> of the home page's rank list.
    #
    # @return [String] YAML list of { position => entry } hashes
    def self.dvd_rank
      to_yaml(get_table('3'))
    end

    # Fetches the home page and parses one rank table.
    #
    # @param rankid [String, Integer] 1-based position of the table's <div>
    #   inside the element with id "ranklist"
    # @return [Array<Hash{Integer => String}>] one single-pair hash per entry
    def self.get_table(rankid)
      doc = open_html(ATMOVIES_MAIN_URL)
      # Interpolation (rather than +) lets callers pass an Integer as well.
      text = doc.xpath("//*[@id = 'ranklist']/div[#{rankid}]").text
      # String#split with no argument splits on any whitespace run, so the
      # resulting entries can never contain tabs or line breaks; no further
      # per-entry clean-up is needed.
      entries = text.gsub(' : ', ':').gsub(' ', '').split
      # The last token is discarded, exactly as the original implementation did.
      rankmix(entries[0...-1])
    end

    # Pairs each entry with its 1-based position.
    #
    # @param t [Array] rank entries in display order
    # @return [Array<Hash{Integer => String}>]
    def self.rankmix(t)
      t.each_with_index.map { |entry, index| { index + 1 => entry.to_s } }
    end

    # Fetches one movie listing and renders it as YAML.
    #
    # @param category [String, Symbol] 'LATEST', 'FIRST_ROUND' or
    #   'SECOND_ROUND' (case-insensitive)
    # @return [String] YAML list of movie hashes (see .mix)
    # @raise [ArgumentError] if the category is not one of the above
    def self.movies(category = 'LATEST')
      url = CATEGORY_URLS.fetch(category.to_s.upcase) do
        raise ArgumentError,
              "unknown category #{category.inspect}; " \
              "expected one of #{CATEGORY_URLS.keys.join(', ')}"
      end
      to_yaml(movies_parser(url))
    end

    # Downloads a listing page and extracts every movie on it.
    #
    # @param url [String] address of a listing page
    # @return [Array<Hash{String => String}>] see .mix for the keys
    def self.movies_parser(url)
      document = open_html(url)
      mix(get_titles(document),
          get_stories(document),
          get_dates(document),
          get_runtime(document),
          get_trailer(document))
    end

    # Builds the "&satitle=" query fragment for a title: the text is
    # converted from UTF-8 to Big5 and then percent-escaped.
    #
    # @param text [String] UTF-8 text
    # @return [String]
    def self.encode_zh(text)
      big5 = Iconv.new('big5', 'utf-8').iconv(text)
      # URI.encode was removed in Ruby 3.0; it was a thin wrapper around
      # URI::DEFAULT_PARSER.escape, which is still available.
      REFLECTION_SATITLE + URI::DEFAULT_PARSER.escape(big5).to_s
    end

    # Reads the text matched by REFLECTION_NAME with tabs and line breaks
    # removed.
    #
    # @param doc [#xpath] parsed page
    # @return [String]
    def self.get_one_movie_name(doc)
      # gsub (not gsub!) so that text needing no clean-up is returned as is
      # instead of nil.
      doc.xpath(REFLECTION_NAME).text.gsub(/[\t\r\n]/, '')
    end

    # Reads the text matched by REFLECTION_CLASS, strips tabs and line
    # breaks, and splits what remains on whitespace.
    #
    # @param doc [#xpath] parsed page
    # @return [Array<String>]
    def self.get_reflection(doc)
      # gsub! returns nil when nothing is replaced, which made the chained
      # split fail; the non-destructive form always returns a String.
      doc.xpath(REFLECTION_CLASS).text.gsub(/[\t\r\n]/, '').split
    end

    # Fetches the page at MOVIE_BASE_URL + code + '/'.
    #
    # @param code [String] movie code as returned by .get_codes
    # @return [Nokogiri::HTML::Document]
    def self.movie_details(code)
      open_html(MOVIE_BASE_URL + code + '/')
    end

    # Downloads and parses an HTML page.
    #
    # @param url [String] absolute URL
    # @return [Nokogiri::HTML::Document]
    def self.open_html(url)
      # Going through URI#open (open-uri) instead of Kernel#open keeps this
      # working on Ruby 3.0+, cannot be tricked into spawning a command by a
      # string starting with "|", and the block form closes the handle.
      URI.parse(url).open { |io| Nokogiri::HTML(io) }
    end

    # Extracts the text of every title link, without tabs or line breaks.
    #
    # @param doc [#xpath] parsed listing page
    # @return [Array<String>]
    def self.get_titles(doc)
      doc.xpath(WHOLE_MOVIEWS_TITLES).map { |title| title.text.gsub(/[\t\n\r]/, '') }
    end

    # Extracts the text of every story block.
    #
    # @param doc [#xpath] parsed listing page
    # @return [Array<String>]
    def self.get_stories(doc)
      doc.xpath(WHOLE_MOVIEWS_STORIES).map(&:text)
    end

    # Extracts the first run of digits from the first part of each date node.
    #
    # @param doc [#xpath] parsed listing page
    # @return [Array<String>] '' where no digits (or no text) are present
    def self.get_runtime(doc)
      split_day_and_time(doc).map { |parts| parts[0].to_s[/\d+/].to_s }
    end

    # Extracts the slash-separated date from the second part of each date
    # node.
    #
    # @param doc [#xpath] parsed listing page
    # @return [Array<String>] '' where the node has no second part or no
    #   date in it
    def self.get_dates(doc)
      split_day_and_time(doc).map { |parts| parts[1].to_s[%r{\d+/\d+/\d+}].to_s }
    end

    # Splits the text of each date node on the fixed whitespace run that
    # separates its two parts.
    #
    # @param doc [#xpath] parsed listing page
    # @return [Array<Array<String>>]
    def self.split_day_and_time(doc)
      gap = "\n\t\t\t\t\s\s\s\s\t"
      doc.xpath(WHOLE_MOVIEWS_DATES).map { |node| node.text.split(gap) }
    end

    # Extracts the movie code from every title link: the third
    # slash-separated segment of the href.
    #
    # @param doc [#xpath] parsed listing page
    # @return [Array<String, nil>] nil where the href has fewer segments
    def self.get_codes(doc)
      doc.xpath(WHOLE_MOVIEWS_CODES).map { |href| href.value.split('/')[2] }
    end

    # Builds the trailer URL for every movie on the page.
    #
    # @param doc [#xpath] parsed listing page
    # @return [Array<String>]
    def self.get_trailer(doc)
      get_codes(doc).map { |code| TRAILER_URL + code }
    end

    # Combines the parallel arrays into one hash per movie. The number of
    # rows follows the titles array.
    #
    # @param t [Array] titles
    # @param s [Array] stories
    # @param d [Array] release dates
    # @param ti [Array] runtimes
    # @param tr [Array] trailer URLs
    # @return [Array<Hash{String => Object}>]
    def self.mix(t, s, d, ti, tr)
      t.each_index.map do |index|
        {
          'title' => t[index],
          'story' => s[index],
          'date' => d[index],
          'runtime(minutes)' => ti[index],
          'trailer' => tr[index]
        }
      end
    end

    # Serialises a result to YAML.
    #
    # @param mix [#to_yaml]
    # @return [String]
    def self.to_yaml(mix)
      mix.to_yaml
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# Gem specification for movie_crawler. The lib directory is put on the load
# path first so the version constant can be required below.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.push(lib_dir)
require 'movie_crawler/version'

Gem::Specification.new do |spec|
  blurb = 'Grab the movies information from the atmovies.com'

  # Identity
  spec.name        = 'movie_crawler'
  spec.version     = MovieCrawler::VERSION
  spec.date        = '2014-10-25'
  spec.summary     = blurb
  spec.description = blurb
  spec.authors     = ['Lee Chen', 'Chen Hung Tu', 'David Yang']
  spec.email       = 'chung1350@hotmail.com'
  spec.homepage    = 'https://github.com/ChenLiZhan/SOA-Crawler'
  spec.license     = 'MIT'

  # Packaged files are whatever git tracks; `app` is the installed executable.
  spec.executables << 'app'
  spec.files      = `git ls-files`.split("\n")
  spec.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")

  # Dependencies
  spec.add_development_dependency 'minitest'
  spec.add_development_dependency 'minitest-rg'
  spec.add_runtime_dependency 'nokogiri', '>= 1.6.2' # v.1.6.2 has documented problems
  spec.add_runtime_dependency 'iconv'
end
|
data/spec/movies_spec.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'minitest/autorun'
# The crawler lives in lib/movie_crawler/crawler.rb.
require File.expand_path('../../lib/movie_crawler/crawler', __FILE__)

LATEST = 'http://www.atmovies.com.tw/movie/movie_new.html'
SECOND_ROUND = 'http://www.atmovies.com.tw/movie/movie_now2-1.html'
TRAILER_URL = 'http://app.atmovies.com.tw/movie/movie.cfm?action=trailer&film_id='

# Pick one of the two listings at random so both get exercised over time.
url = rand < 0.5 ? LATEST : SECOND_ROUND
# NOTE: this performs a live HTTP request when the file is loaded.
sample = MovieCrawler::MovieInfo.movies_parser(url)

describe 'movies_parser should involve' do
  it 'must be non-empty' do
    sample.wont_be_empty
  end

  sample.each do |each_head|
    it 'title must be string' do
      each_head['title'].must_be_instance_of String
    end

    it 'story must be string' do
      each_head['story'].must_be_instance_of String
    end

    it 'date must match the format' do
      each_head['date'].must_match %r{\d+/\d+/\d+}
    end

    it 'runtime must be either empty or digit numbers' do
      # Anchored at both ends: the previous pattern /\A|\d+/ matched every
      # string because the bare \A alternative always succeeds.
      each_head['runtime(minutes)'].must_match(/\A\d*\z/)
    end

    it 'trailer link must accessible' do
      each_head['trailer'].must_include TRAILER_URL
    end
  end
end
|
data/spec/rank_spec.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'minitest/autorun'
require 'minitest/rg'
# The crawler lives in lib/movie_crawler/crawler.rb.
require File.expand_path('../../lib/movie_crawler/crawler', __FILE__)

# 1 to 3 select a different test case: us, taipei, dvd
rand_rank = rand(1..3)
# NOTE: this performs a live HTTP request when the file is loaded.
sample = MovieCrawler::MovieInfo.get_table(rand_rank.to_s)

describe 'table should involve' do
  it 'must be non-empty' do
    sample.wont_be_empty
  end

  it 'size must be ten' do
    sample.size.must_equal 10
  end

  sample.each do |each_movie|
    it 'each_movie must be non-empty' do
      each_movie.wont_be_empty
    end

    it 'each_key must be string' do
      each_movie.each_value { |value| value.must_be_instance_of String }
    end

    # Hash#to_s looks like "{index=>\"movie_name\"}"; optional spaces around
    # => are allowed so the check does not depend on the exact spacing.
    it 'each_key must match the format' do
      each_movie.to_s.must_match %r{\{\d+\s*=>\s*"\W+"\}}
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: movie_crawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Lee Chen
|
8
|
+
- Chen Hung Tu
|
9
|
+
- David Yang
|
10
|
+
autorequire:
|
11
|
+
bindir: bin
|
12
|
+
cert_chain: []
|
13
|
+
date: 2014-10-25 00:00:00.000000000 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: minitest
|
17
|
+
requirement: !ruby/object:Gem::Requirement
|
18
|
+
requirements:
|
19
|
+
- - ">="
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
version: '0'
|
29
|
+
- !ruby/object:Gem::Dependency
|
30
|
+
name: minitest-rg
|
31
|
+
requirement: !ruby/object:Gem::Requirement
|
32
|
+
requirements:
|
33
|
+
- - ">="
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: '0'
|
36
|
+
type: :development
|
37
|
+
prerelease: false
|
38
|
+
version_requirements: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: '0'
|
43
|
+
- !ruby/object:Gem::Dependency
|
44
|
+
name: nokogiri
|
45
|
+
requirement: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 1.6.2
|
50
|
+
type: :runtime
|
51
|
+
prerelease: false
|
52
|
+
version_requirements: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: 1.6.2
|
57
|
+
- !ruby/object:Gem::Dependency
|
58
|
+
name: iconv
|
59
|
+
requirement: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
64
|
+
type: :runtime
|
65
|
+
prerelease: false
|
66
|
+
version_requirements: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: '0'
|
71
|
+
description: Grab the movies information from the atmovies.com
|
72
|
+
email: chung1350@hotmail.com
|
73
|
+
executables:
|
74
|
+
- app
|
75
|
+
extensions: []
|
76
|
+
extra_rdoc_files: []
|
77
|
+
files:
|
78
|
+
- Gemfile
|
79
|
+
- README.md
|
80
|
+
- Rakefile
|
81
|
+
- bin/app
|
82
|
+
- lib/movie_crawler.rb
|
83
|
+
- lib/movie_crawler/crawler.rb
|
84
|
+
- lib/movie_crawler/version.rb
|
85
|
+
- movie_crawler.gemspec
|
86
|
+
- spec/movies_spec.rb
|
87
|
+
- spec/rank_spec.rb
|
88
|
+
homepage: https://github.com/ChenLiZhan/SOA-Crawler
|
89
|
+
licenses:
|
90
|
+
- MIT
|
91
|
+
metadata: {}
|
92
|
+
post_install_message:
|
93
|
+
rdoc_options: []
|
94
|
+
require_paths:
|
95
|
+
- lib
|
96
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - ">="
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: '0'
|
101
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
102
|
+
requirements:
|
103
|
+
- - ">="
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: '0'
|
106
|
+
requirements: []
|
107
|
+
rubyforge_project:
|
108
|
+
rubygems_version: 2.4.2
|
109
|
+
signing_key:
|
110
|
+
specification_version: 4
|
111
|
+
summary: Grab the movies information from the atmovies.com
|
112
|
+
test_files:
|
113
|
+
- spec/movies_spec.rb
|
114
|
+
- spec/rank_spec.rb
|