google_showtimes 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,24 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## ECLIPSE
17
+ .loadpath
18
+
19
+ ## PROJECT::GENERAL
20
+ coverage
21
+ rdoc
22
+ pkg
23
+
24
+ ## PROJECT::SPECIFIC
data/.project ADDED
@@ -0,0 +1,17 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <projectDescription>
3
+ <name>google_showtimes</name>
4
+ <comment></comment>
5
+ <projects>
6
+ </projects>
7
+ <buildSpec>
8
+ <buildCommand>
9
+ <name>org.rubypeople.rdt.core.rubybuilder</name>
10
+ <arguments>
11
+ </arguments>
12
+ </buildCommand>
13
+ </buildSpec>
14
+ <natures>
15
+ <nature>org.rubypeople.rdt.core.rubynature</nature>
16
+ </natures>
17
+ </projectDescription>
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Victor Costan
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,20 @@
1
+ = google_showtimes
2
+
3
+ This gem retrieves movie showtimes from Google, by scraping their HTML results.
4
+
5
+ Note: the code was written a long time ago and is quite ugly. On the bright
6
+ side, I already wrote it, so it comes for free.
7
+
8
+ == Note on Patches/Pull Requests
9
+
10
+ * Fork the project.
11
+ * Make your feature addition or bug fix.
12
+ * Add tests for it. This is important so I don't break it in a
13
+ future version unintentionally.
14
+ * Commit, but do not mess with the Rakefile, version, or history.
15
+ (if you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull)
16
+ * Send me a pull request. Bonus points for topic branches.
17
+
18
+ == Copyright
19
+
20
+ Copyright (c) 2009 Victor Costan. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,64 @@
1
# Rakefile for the google_showtimes gem.
#
# Author:: Victor Costan
# Copyright:: Copyright (C) 2009 Victor Costan
# License:: MIT

require 'rubygems'
require 'rake'

# Gem packaging and release tasks. Jeweler is optional: when it cannot be
# loaded, the remaining tasks in this file are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "google_showtimes"
    gem.summary = %Q{Movie showtimes from Google}
    gem.description = %Q{Library for scraping Google's showtimes search.}
    gem.email = "costan@gmail.com"
    gem.homepage = "http://github.com/costan/google_showtimes"
    gem.authors = ["Victor Costan"]
    gem.add_dependency 'nokogiri', '>= 1.4.1'
    gem.rubyforge_project = 'zerglings'

    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::RubyforgeTasks.new do |rubyforge|
    rubyforge.doc_task = "rdoc"
  end
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Unit tests.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage report. When RCov is missing, the task is still defined so that
# "rake rcov" explains how to install it instead of failing obscurely.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |rcov|
    rcov.libs << 'test'
    rcov.pattern = 'test/**/test_*.rb'
    rcov.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# check_dependencies is defined by Jeweler. Only depend on it when it exists,
# so that "rake test" keeps working without Jeweler installed.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# API documentation. rake/rdoctask is absent from current Rake releases, so
# the task class shipped with RDoc is preferred and the legacy one is only a
# fallback for old toolchains.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end
rdoc_task_class.new do |rdoc|
  # strip drops the trailing newline of the VERSION file from the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "google_showtimes #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 1.0.2
@@ -0,0 +1,60 @@
1
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
# -*- encoding: utf-8 -*-
#
# Gem specification for google_showtimes: package metadata, the list of
# packaged files, and the runtime dependency on nokogiri.

Gem::Specification.new do |s|
  s.name = %q{google_showtimes}
  s.version = "1.0.2"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Victor Costan"]
  s.date = %q{2009-12-24}
  s.description = %q{Library for scraping Google's showtimes search.}
  s.email = %q{costan@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    ".project",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "google_showtimes.gemspec",
    "lib/google_showtimes.rb",
    "test/fixtures/cinemas_movies.html",
    "test/fixtures/cinemas_movies_uk.html",
    "test/fixtures/movie2_cinemas.html",
    "test/fixtures/movie_cinemas.html",
    "test/helper.rb",
    "test/test_google_showtimes.rb"
  ]
  s.homepage = %q{http://github.com/costan/google_showtimes}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  # Set the RubyForge project only when this RubyGems still has the setter,
  # so the spec keeps loading if it is absent.
  s.rubyforge_project = %q{zerglings} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{Movie showtimes from Google}
  s.test_files = [
    "test/test_google_showtimes.rb",
    "test/helper.rb"
  ]

  s.specification_version = 3 if s.respond_to? :specification_version=

  # Feature-detect instead of comparing against Gem::RubyGemsVersion, which is
  # not defined by current RubyGems and made this file fail to load.
  if s.respond_to? :add_runtime_dependency
    s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
  else
    s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
  end
end
60
+
@@ -0,0 +1,285 @@
1
+ # Scraper for Google showtime search.
2
+ #
3
+ # Author:: Victor Costan
4
+ # Copyright:: Copyright (C) 2009 Victor Costan
5
+ # License:: MIT
6
+
7
+ require 'net/http'
8
+ require 'set'
9
+ require 'uri'
10
+
11
+ require 'nokogiri'
12
+
13
# Scraper for Google's movie showtimes search.
#
# The #for method is the only method intended to be used by client code. See its
# documentation to get started.
module GoogleShowtimes
  # Searches Google (google.com/movies) for movie showtimes.
  #
  # Args:
  #   location:: a string containing the location to search for
  #              Google is awesome at geocoding, so throw in zipcodes,
  #              addresses, cities, or hoods
  #   movie:: the name of the movie
  #           if nil, will retrieve all the showtimes at the given location
  #
  # Returns nil as soon as a request does not yield a successful HTTP response.
  # Otherwise returns a string containing the Google-disambiguated location,
  # and an array of hashes. One hash has showtimes for a film at a cinema and
  # looks like this:
  #     { :cinema => { :name => 'AMC 13', :address => '1998 Broadway, ....',
  #                    :phone => '(212) 398-3939' },   # :phone is optional
  #       :film => { :name => 'Dark Knight', :imdb => '0123456' },
  #       :showtimes => [ { :time => Time },
  #                       { :time => Time, :href => 'site selling tickets' } ]
  #     }
  def self.for(location, movie = nil)
    query = search_path(location, movie)

    results = []
    google_location = nil
    # Every results page may link to a next page; follow the links until a
    # page has none.
    while query
      response = Net::HTTP.start('google.com', 80) { |http| http.get(query) }
      return nil unless response.kind_of?(Net::HTTPSuccess)

      partial_results, page_location, query = parse_results(response.body)
      google_location ||= page_location
      results.concat(partial_results)
    end
    return google_location, results
  end

  # Builds the request path for the first page of a showtimes search.
  #
  # Args:
  #   location:: the location string given to #for
  #   movie:: the movie name given to #for, or nil
  #
  # Returns a string containing the path and query for the search. Values are
  # form-encoded, so characters such as '&' cannot break the query string.
  def self.search_path(location, movie = nil)
    near = URI.encode_www_form_component(location)
    return "/movies?near=#{near}" unless movie

    "/movies?q=#{URI.encode_www_form_component(movie)}&near=#{near}"
  end
  private_class_method :search_path

  # Parses a Google showtimes results page.
  #
  # Args:
  #   page_contents:: a string containing the HTML of the results page
  #
  # Returns an array of results, a string containing the Google-disambiguated
  # location, and a string containing the URL for the 'Next >' link. The first
  # two return values are structured like return of the #for method. The last
  # string may be nil if the results page contains no 'Next >' link.
  def self.parse_results(page_contents)
    nokogiri = Nokogiri::HTML(page_contents)

    location = parse_location(nokogiri)
    next_url = parse_next_link(nokogiri)
    results = []

    # The most recently seen theater and movie are paired with every block of
    # showing times that follows them in the page.
    theater, movie = nil, nil
    parse_results_fast(nokogiri) do |info_type, info|
      case info_type
      when :movie
        movie = info
      when :theater
        theater = info
      when :times
        results << { :film => movie, :cinema => theater, :showtimes => info }
      end
    end
    return results, location, next_url
  end

  # Parses a Google showtimes results page.
  #
  # This method uses a fast parsing method, assuming the well-behaved output
  # page produced by Google at the time of the gem's writing.
  #
  # Args:
  #   nokogiri:: a Nokogiri document for the Google showtimes results page
  #
  # Yields a symbol and information for every piece of information found, in
  # the order returned by the XPath query. The symbol is either +:theater+,
  # +:movie+, or +:times+. The information is the same as in the #for method.
  # Elements that fail to parse are skipped.
  def self.parse_results_fast(nokogiri, &block)
    query = '//div[@class="movie" or @class="theater" or @class="times"]'
    nokogiri.xpath(query).each do |div|
      info_type, info =
        case div['class']
        when 'theater' then [:theater, parse_theater_fast(div)]
        when 'movie' then [:movie, parse_movie_fast(div)]
        when 'times' then [:times, parse_showing_times(div)]
        end
      yield info_type, info if info
    end
  end

  # Parses movie theater information in a Google showtime results page.
  #
  # This method uses a fast parsing method, assuming the well-behaved output
  # page produced by Google at the time of the gem's writing.
  #
  # Args:
  #   nokogiri:: a Nokogiri node containing the movie theater data
  #
  # Returns a hash with the keys +:name+, +:address+ and (when a phone number
  # could be separated from the address) +:phone+, or nil if parsing failed.
  def self.parse_theater_fast(nokogiri)
    name_elem = nokogiri.css('.name').first
    address_elem = nokogiri.css('.address').first || nokogiri.css('.info').first
    return nil unless name_elem && address_elem

    address, phone = address_phone(address_elem.text)
    info = { :name => name_elem.text, :address => address }
    info[:phone] = phone if phone
    info
  end

  # Attempts to extract the phone number from an address+phone string.
  #
  # Args:
  #   text:: Google showtimes string containing an address and phone number
  #
  # Returns two strings containing the address, and the phone number. The phone
  # number will be nil if the strings could not be separated, in which case the
  # first string is the unmodified input.
  #
  # Example:
  #     a, p = address_phone('234 West 42nd St., New York - (212) 398-3939')
  def self.address_phone(text)
    # The biggest suffix that consists of non-word characters.
    # HACK: One x is allowed, for extension: (800) 326-3264 x771
    # The match is anchored with \z so that it is always a suffix of the whole
    # text, which the address computation below relies on.
    suffix = text[
        /[[:digit:][:punct:][:space:]]+(?:x[[:digit:][:punct:][:space:]]+)?\z/u]
    return text, nil unless suffix

    address = text[0, text.length - suffix.length]
    # Drop the dash that separates the address from the phone number.
    ph_number = suffix.sub(/\A\s*\-\s*/, '')

    # If it has 50% digits, it's good.
    digit_count = ph_number.scan(/\d/u).length
    return text, nil unless digit_count * 2 >= ph_number.length
    return address, ph_number
  end

  # Parses movie information in a Google showtime results page.
  #
  # This method uses a fast parsing method, assuming the well-behaved output
  # page produced by Google at the time of the gem's writing.
  #
  # Args:
  #   nokogiri:: a Nokogiri node containing the movie data
  #
  # Returns a hash with the keys +:name+ and +:imdb+, or nil if parsing failed
  # (no title element, or no link to the movie's IMDb page).
  def self.parse_movie_fast(nokogiri)
    name_elem = nokogiri.css('div.desc h2').first || nokogiri.css('.name').first
    return nil unless name_elem

    # The first link pointing to an IMDb title page supplies the movie ID.
    nokogiri.css('a').each do |a|
      href = a['href']
      next unless href
      match_data = /imdb\.com\/title\/tt(\d*)\//.match(href)
      return { :name => name_elem.text, :imdb => match_data[1] } if match_data
    end
    nil
  end

  # Parses showing times information in a Google showtime results page.
  #
  # Args:
  #   nokogiri:: a Nokogiri node containing the showing times data
  #
  # Returns an array of hashes with the keys +:time+ (a Time, see parse_time)
  # and (optionally) +:href+. The array is empty if no showing time is found.
  # Times with ticket buying links are listed before the plaintext times.
  def self.parse_showing_times(nokogiri)
    times = []
    linked_times = Set.new

    # Parse times with ticket buying links.
    nokogiri.css('a').each do |a|
      label = a.text
      next unless /\d\:\d\d/ =~ label
      linked_times << label
      times << { :time => label, :href => cleanup_redirects(a['href']) }
    end

    # Parse plaintext times. The node's text also contains the link labels, so
    # the times collected above are skipped here.
    nokogiri.text.split.each do |token|
      time_text = token.gsub(/[^\d\:amp]/, '')
      next unless /\d\:\d\d/ =~ time_text
      next if linked_times.include?(time_text)
      times << { :time => time_text }
    end

    # Parse text-form time into Time objects. The list is walked backwards, and
    # a time without an am/pm suffix inherits the suffix of the closest
    # suffixed time that follows it in the list.
    last_suffix = ''
    times.reverse_each do |showing|
      time = showing[:time]
      suffix = time[-2, 2]
      if ['am', 'pm'].include?(suffix)
        last_suffix = suffix
      else
        time += last_suffix
      end
      showing[:time] = parse_time(time)
    end
    times
  end

  # Attempts to remove Google redirects from a URL.
  #
  # Args:
  #   url:: the value of a link's href attribute; may be nil
  #
  # Returns the unescaped http:// or https:// URL embedded in the given URL. If
  # no such URL is embedded, returns the argument unchanged.
  def self.cleanup_redirects(url)
    return url unless url

    # The embedded URL must be preceded by at least one character, and ends at
    # the next '&' (the following query parameter) or at the end of the line.
    match_data = /.(https?\:\/\/.*?)(\&.*)?$/.match(url)
    match_data ? percent_decode(match_data[1]) : url
  end

  # Decodes the %XX escape sequences in a string.
  #
  # Stands in for URI.unescape, which is not available in current Ruby
  # versions. Unlike form decoding, '+' characters are left untouched.
  def self.percent_decode(str)
    encoding = str.encoding
    encoding = Encoding::UTF_8 if encoding == Encoding::US_ASCII
    # Consecutive escapes are decoded together so that multi-byte characters
    # are reassembled before the encoding is applied.
    str.gsub(/(?:%[0-9a-fA-F]{2})+/) do |escaped|
      [escaped.delete('%')].pack('H*').force_encoding(encoding)
    end
  end
  private_class_method :percent_decode

  # Parses a showtime returned by Google showtimes.
  #
  # Args:
  #   timestr:: a string such as '11:30am', '1:00pm' or '21:15'
  #
  # Returns a Time built with Time.gm from today's date (as given by Time.now)
  # and the parsed hour and minute. If the string does not look like a time,
  # returns Time.now.
  def self.parse_time(timestr)
    time_parts = /(\d+)\:(\d\d)\W*(\w*)$/.match(timestr)
    return Time.now unless time_parts

    hour = time_parts[1].to_i
    minute = time_parts[2].to_i
    case time_parts[3].downcase
    when 'pm'
      # 12pm is noon. Hours past 12 are already in 24-hour form, so they are
      # left alone instead of overflowing the day.
      hour += 12 if hour < 12
    when 'am'
      # 12am is midnight.
      hour = 0 if hour == 12
    end

    today = Time.now
    Time.gm(today.year, today.month, today.day, hour, minute, 0)
  end

  # Parses the disambiguated location from a Google showtimes results page.
  #
  # Args:
  #   nokogiri:: a Nokogiri document for the Google showtimes results page
  #
  # Returns a string containing the disambiguated location, or nil if no
  # location is found.
  def self.parse_location(nokogiri)
    nokogiri.css('h1').each do |h1|
      location_match = /^Showtimes for (.*)$/.match(h1.text)
      return location_match[1] if location_match
    end
    nil
  end

  # Extracts the URL for the "Next >" link from a Google showtimes results page.
  #
  # Args:
  #   nokogiri:: a Nokogiri document for the Google showtimes results page
  #
  # Returns the URL, or nil if no Next link exists on the results page. If the
  # page has several such links, the last one is used.
  def self.parse_next_link(nokogiri)
    url = nil
    nokogiri.css('a').each do |a|
      url = a['href'] if a.text.strip == 'Next'
    end
    url
  end
end