google_showtimes 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,24 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## ECLIPSE
17
+ .loadpath
18
+
19
+ ## PROJECT::GENERAL
20
+ coverage
21
+ rdoc
22
+ pkg
23
+
24
+ ## PROJECT::SPECIFIC
data/.project ADDED
@@ -0,0 +1,17 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <projectDescription>
3
+ <name>google_showtimes</name>
4
+ <comment></comment>
5
+ <projects>
6
+ </projects>
7
+ <buildSpec>
8
+ <buildCommand>
9
+ <name>org.rubypeople.rdt.core.rubybuilder</name>
10
+ <arguments>
11
+ </arguments>
12
+ </buildCommand>
13
+ </buildSpec>
14
+ <natures>
15
+ <nature>org.rubypeople.rdt.core.rubynature</nature>
16
+ </natures>
17
+ </projectDescription>
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Victor Costan
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,20 @@
1
+ = google_showtimes
2
+
3
+ This gem retrieves movie showtimes from Google, by scraping their HTML results.
4
+
5
+ Note: the code was written a long time ago and is quite ugly. On the bright
6
+ side, I already wrote it, so it comes for free.
7
+
8
+ == Note on Patches/Pull Requests
9
+
10
+ * Fork the project.
11
+ * Make your feature addition or bug fix.
12
+ * Add tests for it. This is important so I don't break it in a
13
+ future version unintentionally.
14
+ * Commit, do not mess with rakefile, version, or history.
15
+ (If you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull.)
16
+ * Send me a pull request. Bonus points for topic branches.
17
+
18
+ == Copyright
19
+
20
+ Copyright (c) 2009 Victor Costan. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,64 @@
1
# Rakefile for the google_showtimes gem.
#
# Author:: Victor Costan
# Copyright:: Copyright (C) 2009 Victor Costan
# License:: MIT

require 'rubygems'
require 'rake'

# Packaging tasks come from Jeweler, which is an optional development tool.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "google_showtimes"
    gem.summary = %Q{Movie showtimes from Google}
    gem.description = %Q{Library for scraping Google's showtimes search.}
    gem.email = "costan@gmail.com"
    gem.homepage = "http://github.com/costan/google_showtimes"
    gem.authors = ["Victor Costan"]
    gem.add_dependency 'nokogiri', '>= 1.4.1'
    gem.rubyforge_project = 'zerglings'

    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end

  # Only wire up the RubyForge tasks when the installed Jeweler provides them;
  # a missing constant would raise NameError, which the rescue below does not
  # catch.
  if defined?(Jeweler::RubyforgeTasks)
    Jeweler::RubyforgeTasks.new do |rubyforge|
      rubyforge.doc_task = "rdoc"
    end
  end
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage is optional: when RCov is missing, the :rcov task explains how to
# install it instead of failing at load time.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# The :check_dependencies task is expected to be defined by Jeweler. Define a
# no-op stand-in when it is absent, so that the test task remains runnable
# without Jeweler.
task :check_dependencies unless Rake::Task.task_defined?(:check_dependencies)

task :test => :check_dependencies

task :default => :test

# Prefer the RDoc-provided task; fall back to the task class bundled with old
# Rake releases.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # Strip the trailing newline so it does not end up in the documentation title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "google_showtimes #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 1.0.2
@@ -0,0 +1,60 @@
1
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
# -*- encoding: utf-8 -*-

# Gem specification for google_showtimes: metadata, packaged files, and the
# nokogiri runtime dependency.
Gem::Specification.new do |s|
  s.name = %q{google_showtimes}
  s.version = "1.0.2"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Victor Costan"]
  s.date = %q{2009-12-24}
  s.description = %q{Library for scraping Google's showtimes search.}
  s.email = %q{costan@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    ".project",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "google_showtimes.gemspec",
    "lib/google_showtimes.rb",
    "test/fixtures/cinemas_movies.html",
    "test/fixtures/cinemas_movies_uk.html",
    "test/fixtures/movie2_cinemas.html",
    "test/fixtures/movie_cinemas.html",
    "test/helper.rb",
    "test/test_google_showtimes.rb"
  ]
  s.homepage = %q{http://github.com/costan/google_showtimes}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  # The setter is deprecated in RubyGems, so only call it where it still exists.
  s.rubyforge_project = %q{zerglings} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{Movie showtimes from Google}
  s.test_files = [
    "test/test_google_showtimes.rb",
    "test/helper.rb"
  ]

  if s.respond_to? :specification_version then
    s.specification_version = 3

    # Gem::VERSION replaces the Gem::RubyGemsVersion constant, which current
    # RubyGems releases no longer define.
    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
    else
      s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
    end
  else
    s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
  end
end
60
+
@@ -0,0 +1,285 @@
1
+ # Scraper for Google showtime search.
2
+ #
3
+ # Author:: Victor Costan
4
+ # Copyright:: Copyright (C) 2009 Victor Costan
5
+ # License:: MIT
6
+
7
+ require 'net/http'
8
+ require 'set'
9
+ require 'uri'
10
+
11
+ require 'nokogiri'
12
+
13
+ # Scraper for Google's movie showtimes search.
14
+ #
15
+ # The #for method is the only method intended to be used by client code. See its
16
+ # documentation to get started.
17
module GoogleShowtimes
  # Searches Google (google.com/movies) for movie showtimes.
  #
  # Args:
  #   location:: a string containing the location to search for
  #              Google is awesome at geocoding, so throw in zipcodes,
  #              addresses, cities, or hoods
  #   movie:: the name of the movie (optional)
  #           if nil, will retrieve all the showtimes at the given location
  #
  # Returns a string containing the Google-disambiguated location, and an array
  # of hashes. Returns nil if any of the HTTP requests does not succeed. One
  # hash has showtimes for a film at a cinema and looks like this:
  #     { :cinema => { :name => 'AMC 13', :address => '1998 Broadway, ....' },
  #       :film => { :name => 'Dark Knight', :imdb => '0123456' },
  #       :showtimes => [ { :time => Time },
  #                       { :time => Time, :href => 'site selling tickets' } ]
  #     }
  def self.for(location, movie = nil)
    near = URI.encode_www_form_component(location)
    query = if movie
      "/movies?q=#{URI.encode_www_form_component(movie)}&near=#{near}"
    else
      "/movies?near=#{near}"
    end

    results = []
    google_location = nil
    visited = Set.new
    # Set#add? returns nil for a query that was already fetched, so a 'Next'
    # link pointing back at an earlier page cannot cause an endless loop.
    while query && visited.add?(query)
      response = Net::HTTP.start("google.com", 80) { |http| http.get query }
      return nil unless response.kind_of? Net::HTTPSuccess

      partial_results, page_location, query = parse_results response.body
      google_location ||= page_location
      results.concat partial_results
    end
    return google_location, results
  end

  # Parses a Google showtimes results page.
  #
  # Args:
  #   page_contents:: a string containing the HTML of a Google showtimes
  #                   results page
  #
  # Returns an array of results, a string containing the Google-disambiguated
  # location, and a string containing the URL for the 'Next >' link. The first
  # two return values are structured like return of the #for method. The last
  # string may be nil if the results page contains no 'Next >' link.
  def self.parse_results(page_contents)
    nokogiri = Nokogiri::HTML page_contents

    location = parse_location nokogiri
    next_url = parse_next_link nokogiri
    results = []

    # Showing times are attributed to the most recently seen movie and theater.
    theater, movie = nil, nil
    parse_results_fast nokogiri do |info_type, info|
      case info_type
      when :movie
        movie = info
      when :theater
        theater = info
      when :times
        results << { :film => movie, :cinema => theater, :showtimes => info }
      end
    end
    return results, location, next_url
  end

  # Parses a Google showtimes results page.
  #
  # This method uses a fast parsing method, assuming the well-behaved output
  # page produced by Google at the time of the gem's writing.
  #
  # Args:
  #   nokogiri:: a Nokogiri document for the Google showtimes results page
  #
  # Yields a symbol and information hash for every piece of information found.
  # The symbol is either +:theater+, +:movie+, or +:times+. The information is
  # the same as in the #for method. Elements that fail to parse are skipped.
  def self.parse_results_fast(nokogiri, &block)
    query = '//div[@class="movie" or @class="theater" or @class="times"]'
    nokogiri.xpath(query).each do |div|
      case div['class']
      when 'theater'
        info = parse_theater_fast(div)
        yield :theater, info if info
      when 'movie'
        info = parse_movie_fast(div)
        yield :movie, info if info
      when 'times'
        info = parse_showing_times(div)
        yield :times, info if info
      end
    end
  end

  # Parses movie theater information in a Google showtime results page.
  #
  # This method uses a fast parsing method, assuming the well-behaved output
  # page produced by Google at the time of the gem's writing.
  #
  # Args:
  #   nokogiri:: a Nokogiri node containing the movie theater data
  #
  # Returns a hash with the keys +:name+ and +:address+, plus +:phone+ when a
  # phone number could be separated from the address, or nil if parsing failed.
  def self.parse_theater_fast(nokogiri)
    name_elem = nokogiri.css('.name').first
    address_elem = nokogiri.css('.address').first || nokogiri.css('.info').first
    return nil unless name_elem && address_elem

    address, phone = address_phone(address_elem.text)
    info = { :name => name_elem.text, :address => address }
    info[:phone] = phone if phone
    info
  end

  # Attempts to extract the phone number from an address+phone string.
  #
  # Args:
  #   text:: Google showtimes string containing an address and phone number
  #
  # Returns two strings containing the address, and the phone number. The phone
  # number will be nil (and the address will be the unmodified text) if the
  # strings could not be separated.
  #
  # Example:
  #   a, p = address_phone('234 West 42nd St., New York - (212) 398-3939')
  def self.address_phone(text)
    # The biggest suffix that consists of non-word characters.
    # HACK: One x is allowed, for extension: (800) 326-3264 x771
    suffix = text.scan(
        /[[:digit:][:punct:][:space:]]+(?:x[[:digit:][:punct:][:space:]]+)?$/u).
        max_by(&:length)
    return text, nil unless suffix

    address = text[0, text.length - suffix.length]
    # Drop the dash that separates the address from the phone number.
    ph_number = suffix.sub(/^\s*\-\s*/, '')

    # If it has 50% digits, it's good. A suffix without any digit (for example
    # a dangling separator) is never a phone number.
    digit_count = ph_number.scan(/\d/u).length
    return text, nil if digit_count.zero? || digit_count * 2 < ph_number.length
    return address, ph_number
  end

  # Parses movie information in a Google showtime results page.
  #
  # This method uses a fast parsing method, assuming the well-behaved output
  # page produced by Google at the time of the gem's writing.
  #
  # Args:
  #   nokogiri:: a Nokogiri node containing the movie data
  #
  # Returns a hash with the keys +:name+ and +:imdb+, or nil if parsing failed.
  # Parsing fails when the node has no name element or no link to an IMDb
  # title page.
  def self.parse_movie_fast(nokogiri)
    name_elem = nokogiri.css('div.desc h2').first || nokogiri.css('.name').first
    return nil unless name_elem

    nokogiri.css('a').each do |a|
      # Links without a href attribute are matched against an empty string.
      match_data = /imdb\.com\/title\/tt(\d*)\//.match a['href'].to_s
      return { :name => name_elem.text, :imdb => match_data[1] } if match_data
    end
    nil
  end

  # Parses showing times information in a Google showtime results page.
  #
  # Args:
  #   nokogiri:: a Nokogiri node containing the showing times data
  #
  # Returns an array of hashes with the keys +:time+ (a Time, see #parse_time)
  # and (optionally) +:href+. The array is empty if no showing time is found.
  def self.parse_showing_times(nokogiri)
    times = []
    time_set = Set.new

    # Parse times with ticket buying links. The link text is normalized in the
    # same way as the plaintext times below, so the duplicate check and the
    # am/pm suffix detection see the same strings for both kinds of times.
    nokogiri.css('a').each do |a|
      time_text = a.text.gsub(/[^\d\:amp]/, '')
      next unless /\d\:\d\d/ =~ time_text
      time_set << time_text
      times << { :time => time_text, :href => cleanup_redirects(a['href']) }
    end

    # Parse plaintext times. The node's text also includes the link text, so
    # times that were already collected above are skipped.
    nokogiri.text.split.each do |word|
      time_text = word.gsub(/[^\d\:amp]/, '')
      next unless /\d\:\d\d/ =~ time_text
      next if time_set.include? time_text
      times << { :time => time_text }
    end

    # Parse text-form time into Time objects. Walking backwards lets a time
    # without an am/pm suffix inherit the suffix of the closest time after it.
    last_suffix = ''
    (times.length - 1).downto(0) do |index|
      time = times[index][:time]

      if ['am', 'pm'].include? time[-2, 2]
        last_suffix = time[-2, 2]
      else
        time += last_suffix
      end
      times[index][:time] = parse_time time
    end
    times
  end

  # Attempts to remove Google redirects from a URL.
  #
  # Args:
  #   url:: the value of a link's href attribute; may be nil
  #
  # Returns the decoded http:// or https:// URL embedded in the given URL, up
  # to the first '&'. If no embedded URL is found, the argument is returned
  # unchanged.
  def self.cleanup_redirects(url)
    return url if url.nil?

    match_data = /.(https?\:\/\/.*?)(\&.*)?$/.match url
    return url unless match_data

    begin
      URI.decode_www_form_component match_data[1]
    rescue ArgumentError
      # Malformed %-escapes: fall back to the embedded URL without decoding.
      match_data[1]
    end
  end

  # Parses a showtime returned by Google showtimes.
  #
  # Args:
  #   timestr:: a string such as '11:30am', '1:00pm' or '18:45'
  #
  # Returns a UTC Time on today's date (according to the local clock) with the
  # parsed hour and minute. If the string does not contain a time, the current
  # time is returned. Time.gm raises ArgumentError for out-of-range values.
  def self.parse_time(timestr)
    now = Time.now
    time_parts = /(\d+)\:(\d\d)\W*(\w*)$/.match timestr
    return now unless time_parts

    hour = time_parts[1].to_i
    minute = time_parts[2].to_i
    case time_parts[3].downcase
    when 'pm'
      # Hours of 12 and above are already in 24-hour form.
      hour += 12 if hour < 12
    when 'am'
      hour = 0 if hour == 12
    end
    Time.gm(now.year, now.month, now.day, hour, minute, 0)
  end

  # Parses the disambiguated location from a Google showtimes results page.
  #
  # Args:
  #   nokogiri:: a Nokogiri document for the Google showtimes results page
  #
  # Returns a string containing the disambiguated location, or nil if no
  # location is found.
  def self.parse_location(nokogiri)
    nokogiri.css('h1').each do |h1|
      location_match = /^Showtimes for (.*)$/.match h1.text
      return location_match[1] if location_match
    end
    nil
  end

  # Extracts the URL for the "Next >" link from a Google showtimes results page.
  #
  # Args:
  #   nokogiri:: a Nokogiri document for the Google showtimes results page
  #
  # Returns the URL, or nil if no Next link exists on the results page. If the
  # page has several Next links, the last one is used.
  def self.parse_next_link(nokogiri)
    url = nil
    nokogiri.css('a').each do |a|
      url = a['href'] if a.text.strip == 'Next'
    end
    url
  end
end