google_showtimes 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +24 -0
- data/.project +17 -0
- data/LICENSE +20 -0
- data/README.rdoc +20 -0
- data/Rakefile +64 -0
- data/VERSION +1 -0
- data/google_showtimes.gemspec +60 -0
- data/lib/google_showtimes.rb +285 -0
- data/test/fixtures/cinemas_movies.html +5 -0
- data/test/fixtures/cinemas_movies_uk.html +5 -0
- data/test/fixtures/movie2_cinemas.html +7 -0
- data/test/fixtures/movie_cinemas.html +26 -0
- data/test/helper.rb +9 -0
- data/test/test_google_showtimes.rb +130 -0
- metadata +80 -0
data/.document
ADDED
data/.gitignore
ADDED
data/.project
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<projectDescription>
|
3
|
+
<name>google_showtimes</name>
|
4
|
+
<comment></comment>
|
5
|
+
<projects>
|
6
|
+
</projects>
|
7
|
+
<buildSpec>
|
8
|
+
<buildCommand>
|
9
|
+
<name>org.rubypeople.rdt.core.rubybuilder</name>
|
10
|
+
<arguments>
|
11
|
+
</arguments>
|
12
|
+
</buildCommand>
|
13
|
+
</buildSpec>
|
14
|
+
<natures>
|
15
|
+
<nature>org.rubypeople.rdt.core.rubynature</nature>
|
16
|
+
</natures>
|
17
|
+
</projectDescription>
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Victor Costan
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
= google_showtimes
|
2
|
+
|
3
|
+
This gem retrieves movie showtimes from Google, by scraping their HTML results.
|
4
|
+
|
5
|
+
Note: the code was written a long time ago and is quite ugly. On the bright
|
6
|
+
side, I already wrote it, so it comes for free.
|
7
|
+
|
8
|
+
== Note on Patches/Pull Requests
|
9
|
+
|
10
|
+
* Fork the project.
|
11
|
+
* Make your feature addition or bug fix.
|
12
|
+
* Add tests for it. This is important so I don't break it in a
|
13
|
+
future version unintentionally.
|
14
|
+
* Commit, do not mess with rakefile, version, or history.
|
15
|
+
(If you want to have your own version, that is fine, but bump the version in a commit by itself so that I can ignore it when I pull.)
|
16
|
+
* Send me a pull request. Bonus points for topic branches.
|
17
|
+
|
18
|
+
== Copyright
|
19
|
+
|
20
|
+
Copyright (c) 2009 Victor Costan. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
# Rakefile for the google_showtimes gem.
|
2
|
+
#
|
3
|
+
# Author:: Victor Costan
|
4
|
+
# Copyright:: Copyright (C) 2009 Victor Costan
|
5
|
+
# License:: MIT
|
6
|
+
|
7
|
+
require 'rubygems'
|
8
|
+
require 'rake'
|
9
|
+
|
10
|
+
begin
|
11
|
+
require 'jeweler'
|
12
|
+
Jeweler::Tasks.new do |gem|
|
13
|
+
gem.name = "google_showtimes"
|
14
|
+
gem.summary = %Q{Movie showtimes from Google}
|
15
|
+
gem.description = %Q{Library for scraping Google's showtimes search.}
|
16
|
+
gem.email = "costan@gmail.com"
|
17
|
+
gem.homepage = "http://github.com/costan/google_showtimes"
|
18
|
+
gem.authors = ["Victor Costan"]
|
19
|
+
gem.add_dependency 'nokogiri', '>= 1.4.1'
|
20
|
+
gem.rubyforge_project = 'zerglings'
|
21
|
+
|
22
|
+
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
23
|
+
end
|
24
|
+
Jeweler::RubyforgeTasks.new do |rubyforge|
|
25
|
+
rubyforge.doc_task = "rdoc"
|
26
|
+
end
|
27
|
+
|
28
|
+
rescue LoadError
|
29
|
+
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
30
|
+
end
|
31
|
+
|
32
|
+
require 'rake/testtask'
|
33
|
+
Rake::TestTask.new(:test) do |test|
|
34
|
+
test.libs << 'lib' << 'test'
|
35
|
+
test.pattern = 'test/**/test_*.rb'
|
36
|
+
test.verbose = true
|
37
|
+
end
|
38
|
+
|
39
|
+
begin
|
40
|
+
require 'rcov/rcovtask'
|
41
|
+
Rcov::RcovTask.new do |test|
|
42
|
+
test.libs << 'test'
|
43
|
+
test.pattern = 'test/**/test_*.rb'
|
44
|
+
test.verbose = true
|
45
|
+
end
|
46
|
+
rescue LoadError
|
47
|
+
task :rcov do
|
48
|
+
abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
task :test => :check_dependencies
|
53
|
+
|
54
|
+
task :default => :test
|
55
|
+
|
56
|
+
# Current Rake releases no longer ship rake/rdoctask; RDoc provides the
# replacement task. Fall back to the legacy task only on installations that
# predate rdoc/task, so the Rakefile keeps loading on both.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # Strip the trailing newline of the VERSION file so that it does not leak
  # into the generated page title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "google_showtimes #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.0.2
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{google_showtimes}
|
8
|
+
s.version = "1.0.2"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Victor Costan"]
|
12
|
+
s.date = %q{2009-12-24}
|
13
|
+
s.description = %q{Library for scraping Google's showtimes search.}
|
14
|
+
s.email = %q{costan@gmail.com}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE",
|
17
|
+
"README.rdoc"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".document",
|
21
|
+
".gitignore",
|
22
|
+
".project",
|
23
|
+
"LICENSE",
|
24
|
+
"README.rdoc",
|
25
|
+
"Rakefile",
|
26
|
+
"VERSION",
|
27
|
+
"google_showtimes.gemspec",
|
28
|
+
"lib/google_showtimes.rb",
|
29
|
+
"test/fixtures/cinemas_movies.html",
|
30
|
+
"test/fixtures/cinemas_movies_uk.html",
|
31
|
+
"test/fixtures/movie2_cinemas.html",
|
32
|
+
"test/fixtures/movie_cinemas.html",
|
33
|
+
"test/helper.rb",
|
34
|
+
"test/test_google_showtimes.rb"
|
35
|
+
]
|
36
|
+
s.homepage = %q{http://github.com/costan/google_showtimes}
|
37
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
38
|
+
s.require_paths = ["lib"]
|
39
|
+
s.rubyforge_project = %q{zerglings}
|
40
|
+
s.rubygems_version = %q{1.3.5}
|
41
|
+
s.summary = %q{Movie showtimes from Google}
|
42
|
+
s.test_files = [
|
43
|
+
"test/test_google_showtimes.rb",
|
44
|
+
"test/helper.rb"
|
45
|
+
]
|
46
|
+
|
47
|
+
if s.respond_to? :specification_version then
|
48
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
49
|
+
s.specification_version = 3
|
50
|
+
|
51
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
52
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
|
53
|
+
else
|
54
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
55
|
+
end
|
56
|
+
else
|
57
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
@@ -0,0 +1,285 @@
|
|
1
|
+
# Scraper for Google showtime search.
|
2
|
+
#
|
3
|
+
# Author:: Victor Costan
|
4
|
+
# Copyright:: Copyright (C) 2009 Victor Costan
|
5
|
+
# License:: MIT
|
6
|
+
|
7
|
+
require 'net/http'
|
8
|
+
require 'set'
|
9
|
+
require 'uri'
|
10
|
+
|
11
|
+
require 'nokogiri'
|
12
|
+
|
13
|
+
# Scraper for Google's movie showtimes search.
|
14
|
+
#
|
15
|
+
# The #for method is the only method intended to be used by client code. See its
|
16
|
+
# documentation to get started.
|
17
|
+
module GoogleShowtimes
|
18
|
+
# Searches Google (google.com/movies) for movie showtimes.
|
19
|
+
#
|
20
|
+
# Args:
|
21
|
+
# movie:: the name of the movie
|
22
|
+
# if nil, will retrieve all the showtimes at the given location
|
23
|
+
# location:: a string containing the location to search for
|
24
|
+
# Google is awesome at geocoding, so throw in zipcodes,
|
25
|
+
# addresses, cities, or hoods
|
26
|
+
#
|
27
|
+
# Returns a string containing the Google-disambiguated location, and an array
|
28
|
+
# of hashes. One hash has showtimes for a film at a cinema and looks like
|
29
|
+
# this:
|
30
|
+
# { :cinema => { :name => 'AMC 13', :address => '1998 Broadway, ....' },
|
31
|
+
# :film => { :name => 'Dark Knight', :imdb => '0123456' },
|
32
|
+
# :showtimes => [ { :time => '11:30am' },
|
33
|
+
# { :time => '1:00', :href => 'site selling tickets' } ]
|
34
|
+
# }
|
35
|
+
def self.for(location, movie = nil)
  # URI.encode was removed in Ruby 3.0, and it never escaped '&' or '=', so a
  # title such as "Fast & Furious" corrupted the query string. Form-component
  # encoding escapes every reserved character.
  near = URI.encode_www_form_component(location)
  query = if movie
    "/movies?q=#{URI.encode_www_form_component(movie)}&near=#{near}"
  else
    "/movies?near=#{near}"
  end

  results = []
  google_location = nil
  # parse_results hands back the URL of the next results page, or nil once
  # there is no such link, which ends the loop.
  while query
    response = Net::HTTP.start("google.com", 80) { |http| http.get query }
    # Any response that is not a success aborts the whole search with nil.
    return nil unless response.kind_of? Net::HTTPSuccess

    partial_results, page_location, query = parse_results response.body
    # The location reported by the first page that has one is kept.
    google_location ||= page_location
    results.concat partial_results
  end
  return google_location, results
end
|
56
|
+
|
57
|
+
# Parses a Google showtimes results page.
|
58
|
+
#
|
59
|
+
# Args:
|
60
|
+
# page_contents:: a string containing the HTML of a Google showtimes results page
|
61
|
+
#
|
62
|
+
# Returns an array of results, a string containing the Google-disambiguated
|
63
|
+
# location, and a string containing the URL for the 'Next >' link. The first
|
64
|
+
# two return values are structured like return of the #for method. The last
|
65
|
+
# string may be nil if the results page contains no 'Next >' link.
|
66
|
+
def self.parse_results(page_contents)
  document = Nokogiri::HTML page_contents
  place = parse_location document
  next_page = parse_next_link document

  # Every :times event is paired with the most recently reported movie and
  # theater; either may still be nil if none has been reported yet.
  current = { :film => nil, :cinema => nil }
  showings = []
  parse_results_fast document do |kind, info|
    if kind == :times
      showings << { :film => current[:film], :cinema => current[:cinema],
                    :showtimes => info }
    elsif kind == :movie
      current[:film] = info
    elsif kind == :theater
      current[:cinema] = info
    end
  end

  return showings, place, next_page
end
|
86
|
+
|
87
|
+
# Parses a Google showtimes results page.
|
88
|
+
#
|
89
|
+
# This method uses a fast parsing method, assuming the well-behaved output
|
90
|
+
# page produced by Google at the time of the gem's writing.
|
91
|
+
#
|
92
|
+
# Args:
|
93
|
+
# nokogiri:: a Nokogiri document for the Google showtimes results page
|
94
|
+
#
|
95
|
+
# Yields a symbol and information hash for every piece of information found.
|
96
|
+
# The symbol is either +:theater+, +:movie+, or +:times+. The information is
|
97
|
+
# the same as in the #for method.
|
98
|
+
def self.parse_results_fast(nokogiri, &block)
  # Maps the class attribute of a div to the symbol yielded for it and to the
  # module method that extracts its information.
  parsers = {
    'theater' => [:theater, :parse_theater_fast],
    'movie'   => [:movie, :parse_movie_fast],
    'times'   => [:times, :parse_showing_times]
  }

  xpath = '//div[@class="movie" or @class="theater" or @class="times"]'
  nokogiri.xpath(xpath).each do |div|
    kind, parser = parsers[div['class']]
    next unless kind

    # Divs whose parser returns nil or false are skipped silently.
    info = send(parser, div)
    yield kind, info if info
  end
end
|
117
|
+
|
118
|
+
# Parses movie theater information in a Google showtime results page.
|
119
|
+
#
|
120
|
+
# This method uses a fast parsing method, assuming the well-behaved output
|
121
|
+
# page produced by Google at the time of the gem's writing.
|
122
|
+
#
|
123
|
+
# Args:
|
124
|
+
# nokogiri:: a Nokogiri node containing the movie theater data
|
125
|
+
#
|
126
|
+
# Returns a hash with the keys +:name+ and +:address+, or nil if parsing
|
127
|
+
# failed.
|
128
|
+
def self.parse_theater_fast(nokogiri)
  name_node = nokogiri.css('.name').first
  # The '.info' element is only consulted when there is no '.address' element.
  location_node = nokogiri.css('.address').first || nokogiri.css('.info').first
  return nil unless name_node && location_node

  address, phone = address_phone(location_node.text)
  theater = { :name => name_node.text, :address => address }
  # The :phone key is present only when a phone number could be split off.
  theater[:phone] = phone if phone
  theater
end
|
139
|
+
|
140
|
+
# Attempts to extract the phone number from an address+phone string.
|
141
|
+
#
|
142
|
+
# Args:
|
143
|
+
# text:: Google showtimes string containing an address and phone number
|
144
|
+
#
|
145
|
+
# Returns two strings containing the address, and the phone number. The phone
|
146
|
+
# number will be nil if the strings could not be separated.
|
147
|
+
#
|
148
|
+
# Example:
|
149
|
+
# a, p = address_phone('234 West 42nd St., New York - (212) 398-3939')
|
150
|
+
def self.address_phone(text)
  # The longest suffix made up only of digits, punctuation and whitespace.
  # HACK: One x is allowed, for extension: (800) 326-3264 x771
  #
  # The match is anchored with \z (end of string) rather than $ (end of any
  # line): the address is computed below by cutting the match off the end of
  # the text, which is only correct when the match really is the final suffix.
  suffix = text[
      /[[:digit:][:punct:][:space:]]+(?:x[[:digit:][:punct:][:space:]]+)?\z/u]
  return text, nil unless suffix

  address = text[0, text.length - suffix.length]
  # Drop the " - " separator between the address and the phone number.
  ph_number = suffix.sub(/\A\s*\-\s*/, '')

  # If it has 50% digits, it's good. A candidate without any digit (e.g. a
  # dangling separator) is never a phone number.
  digit_count = ph_number.scan(/\d/u).length
  return text, nil if digit_count.zero?
  return text, nil unless digit_count * 2 >= ph_number.length
  return address, ph_number
end
|
166
|
+
|
167
|
+
# Parses movie information in a Google showtime results page.
|
168
|
+
#
|
169
|
+
# This method uses a fast parsing method, assuming the well-behaved output
|
170
|
+
# page produced by Google at the time of the gem's writing.
|
171
|
+
#
|
172
|
+
# Args:
|
173
|
+
# nokogiri:: a Nokogiri node containing the movie data
|
174
|
+
#
|
175
|
+
# Returns a hash with the keys +:name+ and +:imdb+, or nil if parsing failed.
|
176
|
+
def self.parse_movie_fast(nokogiri)
  # The title is looked up in 'div.desc h2' first and in '.name' otherwise.
  name_elem = nokogiri.css('div.desc h2').first || nokogiri.css('.name').first
  # Without a title element parsing has failed; return nil instead of raising
  # NoMethodError on nil once an IMDb link is found.
  return nil unless name_elem

  # The first link pointing at an IMDb title page supplies the IMDb id.
  nokogiri.css('a').each do |a|
    match_data = /imdb\.com\/title\/tt(\d*)\//.match a['href'].to_s
    next unless match_data

    return { :name => name_elem.text, :imdb => match_data[1] }
  end

  # No IMDb link was found: treated as a parsing failure.
  nil
end
|
190
|
+
|
191
|
+
# Parses showing times information in a Google showtime results page.
|
192
|
+
#
|
193
|
+
# Args:
|
194
|
+
# nokogiri:: a Nokogiri node containing the showing times data
|
195
|
+
#
|
196
|
+
# Returns an array of hashes, each with the key +:time+ and (optionally)
|
197
|
+
# +:href+. The array is empty if no showing times are found.
|
198
|
+
def self.parse_showing_times(nokogiri)
  clock = /\d\:\d\d/

  # Times rendered as links: keep the link target next to the time.
  linked = nokogiri.css('a').select { |a| clock =~ a.text }
  linked_texts = Set.new(linked.map { |a| a.text })
  times = linked.map do |a|
    { :time => a.text, :href => cleanup_redirects(a['href']) }
  end

  # Times rendered as plain text: strip everything but digits, ':' and the
  # letters of am/pm, and skip the ones already collected from links.
  nokogiri.text.split.each do |word|
    candidate = word.gsub(/[^\d\:amp]/, '')
    next unless clock =~ candidate
    next if linked_texts.include? candidate
    times << { :time => candidate }
  end

  # Walk the list backwards so that a time without an am/pm suffix picks up
  # the suffix of the nearest following entry that has one, then replace each
  # text time with the value returned by parse_time.
  suffix = ''
  times.reverse_each do |entry|
    label = entry[:time]
    tail = label[-2, 2]
    if tail == 'am' || tail == 'pm'
      suffix = tail
    else
      label += suffix
    end
    entry[:time] = parse_time label
  end

  times
end
|
231
|
+
|
232
|
+
# Attempts to remove Google redirects from a URL.
|
233
|
+
def self.cleanup_redirects(url)
  # The redirect target is the first http:// or https:// URL that is preceded
  # by at least one character, up to the next '&' or the end of the line.
  match_data = /.(https?\:\/\/.*?)(\&.*)?$/.match url.to_s
  # Not a recognizable redirect (or nil): hand the input back untouched.
  return url unless match_data

  target = match_data[1]
  encoding = target.encoding
  encoding = Encoding::UTF_8 if encoding == Encoding::US_ASCII

  # URI.unescape was removed in Ruby 3.0. Decode %XX escapes by hand, which
  # keeps its semantics: '+' is left alone and malformed escapes are kept.
  target.gsub(/%[0-9a-fA-F]{2}/) do |escape|
    [escape[1, 2]].pack('H2').force_encoding(encoding)
  end
end
|
237
|
+
|
238
|
+
# Parses a showtime returned by Google showtimes.
|
239
|
+
def self.parse_time(timestr)
  today = Time.now
  parts = /(\d+)\:(\d\d)\W*(\w*)$/.match timestr
  # Text that does not look like a time yields the current time.
  return today unless parts

  hour = parts[1].to_i
  minute = parts[2].to_i
  # Convert 12-hour clock readings; times without am/pm are taken as-is.
  case parts[3].downcase
  when 'pm'
    hour += 12 unless hour == 12
  when 'am'
    hour -= 12 if hour == 12
  end

  # The result carries today's date (as reported by Time.now) and is built as
  # a UTC time.
  Time.gm(today.year, today.month, today.day, hour, minute, 0)
end
|
256
|
+
|
257
|
+
# Parses the disambiguated location from a Google showtimes results page.
|
258
|
+
#
|
259
|
+
# Args:
|
260
|
+
# nokogiri:: a Nokogiri document for the Google showtimes results page
|
261
|
+
#
|
262
|
+
# Returns a string containing the disambiguated location, or nil if no
|
263
|
+
# location is found.
|
264
|
+
def self.parse_location(nokogiri)
  # The first h1 of the form "Showtimes for <place>" names the location.
  nokogiri.css('h1').each do |heading|
    place = heading.text[/^Showtimes for (.*)$/, 1]
    return place if place
  end

  # No heading matched.
  nil
end
|
271
|
+
|
272
|
+
# Extracts the URL for the "Next >" link from a Google showtimes results page.
|
273
|
+
#
|
274
|
+
# Args:
|
275
|
+
# nokogiri:: a Nokogiri document for the Google showtimes results page
|
276
|
+
#
|
277
|
+
# Returns the URL, or nil if no Next link exists on the results page.
|
278
|
+
def self.parse_next_link(nokogiri)
  # When several links are labelled "Next", the last one on the page wins.
  next_anchor = nokogiri.css('a').select { |a| a.text.strip == 'Next' }.last
  next_anchor && next_anchor['href']
end
|
285
|
+
end
|