picturehouse_uk 1.0.2 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -1
- data/CHANGELOG.md +18 -0
- data/README.md +7 -10
- data/Rakefile +20 -7
- data/lib/picturehouse_uk.rb +4 -1
- data/lib/picturehouse_uk/cinema.rb +51 -100
- data/lib/picturehouse_uk/film.rb +22 -8
- data/lib/picturehouse_uk/internal/address_parser.rb +72 -0
- data/lib/picturehouse_uk/internal/cinema_page.rb +35 -0
- data/lib/picturehouse_uk/internal/film_with_screenings_parser.rb +87 -37
- data/lib/picturehouse_uk/internal/title_sanitizer.rb +49 -0
- data/lib/picturehouse_uk/internal/website.rb +39 -0
- data/lib/picturehouse_uk/screening.rb +63 -19
- data/lib/picturehouse_uk/version.rb +2 -2
- data/picturehouse_uk.gemspec +11 -11
- data/test/fixture_updater.rb +43 -0
- data/test/fixtures/address-fragments/duke-of-yorks.html +39 -0
- data/test/fixtures/address-fragments/hackney-picturehouse.html +12 -0
- data/test/fixtures/cinema/Duke_Of_Yorks.html +4370 -0
- data/test/fixtures/cinema/Duke_Of_Yorks/film_last.html +45 -0
- data/test/fixtures/cinema/Duke_Of_Yorks/film_second.html +37 -0
- data/test/fixtures/{abbeygate-contact-us.html → contact_us/Duke_Of_Yorks.html} +635 -156
- data/test/fixtures/{dukes-at-komedia-contact-us.html → contact_us/Dukes_At_Komedia.html} +582 -98
- data/test/fixtures/{picturehouses-homepage.html → home.html} +642 -146
- data/test/lib/picturehouse_uk/cinema_test.rb +127 -307
- data/test/lib/picturehouse_uk/film_test.rb +65 -16
- data/test/lib/picturehouse_uk/internal/address_parser_test.rb +55 -0
- data/test/lib/picturehouse_uk/internal/cinema_page_test.rb +51 -0
- data/test/lib/picturehouse_uk/internal/film_with_screenings_parser_test.rb +44 -151
- data/test/lib/picturehouse_uk/internal/title_sanitizer_test.rb +131 -0
- data/test/lib/picturehouse_uk/internal/website_test.rb +64 -0
- data/test/lib/picturehouse_uk/screening_test.rb +149 -21
- data/test/live/integration_test.rb +68 -0
- data/test/test_helper.rb +3 -1
- metadata +40 -43
- data/test/fixtures/dukes-at-komedia-cinema.html +0 -7148
- data/test/fixtures/film_node/blue-jasmine-done.html +0 -53
- data/test/fixtures/film_node/blue-jasmine-future.html +0 -55
- data/test/fixtures/film_node/bolshoi-spartacus.html +0 -26
- data/test/fixtures/film_node/captain-phillips-with-silver-screen-and-subtitles.html +0 -103
- data/test/fixtures/film_node/fifth-estate-with-big-scream.html +0 -73
- data/test/fixtures/film_node/london-film-festival-with-toddler-time.html +0 -46
- data/test/fixtures/film_node/met-encore-rusalka-as-live.html +0 -26
- data/test/fixtures/film_node/nt-encore-hamlet.html +0 -26
- data/test/fixtures/film_node/planes-with-kids-club.html +0 -77
- data/test/fixtures/film_node/royal-opera-house-don-quixote.html +0 -26
- data/test/fixtures/film_node/rsc-encore-richard-ii.html +0 -28
- data/test/fixtures/film_node/rsc-live-richard-ii.html +0 -41
- data/test/fixtures/film_node/rsc-live-the-two-gentlemen-of-verona-zero-cert.html +0 -19
- data/test/fixtures/hackney-contact-us.html +0 -998
@@ -0,0 +1,35 @@
|
|
1
|
+
module PicturehouseUk
|
2
|
+
# @api private
|
3
|
+
module Internal
|
4
|
+
# Splits a cinema's page into one HTML fragment per film
class CinemaPage
  # css selector for a film + its screenings
  FILM_CSS = '#events .largelist .item'.freeze

  # @param [String] cinema_id Picturehouse cinema id as it appears in the
  #   site's urls, e.g. 'Duke_Of_Yorks'
  def initialize(cinema_id)
    @cinema_id = cinema_id
  end

  # break up the page into individual chunks for each film
  # @return [Array<String>] html chunks for a film and its screenings
  def film_html
    # strip the leading whitespace from every line of each fragment
    film_nodes.map { |node| node.to_s.gsub(/^\s+/, '') }
  end

  private

  # raw html of the cinema page, fetched once and memoized
  def cinema
    @cinema ||= PicturehouseUk::Internal::Website.new.cinema(@cinema_id)
  end

  # parsed cinema page, memoized
  def cinema_doc
    @cinema_doc ||= Nokogiri::HTML(cinema)
  end

  # nodes matching FILM_CSS in the parsed page
  def film_nodes
    cinema_doc.css(FILM_CSS)
  end
end
|
34
|
+
end
|
35
|
+
end
|
@@ -1,59 +1,109 @@
|
|
1
1
|
module PicturehouseUk
|
2
|
-
|
3
|
-
# Internal utility classes: Do not use
|
4
2
|
# @api private
|
5
3
|
module Internal
|
6
|
-
|
7
4
|
# Parses a chunk of HTML to derive movie showing data
class FilmWithScreeningsParser
  # film name css
  FILM_NAME_CSS = '.movielink'.freeze
  # showings css
  SCREENING_CSS = 'a[epoch]'.freeze

  # @param [String] html a chunk of html for one film and its screenings
  def initialize(html)
    @html = html
  end

  # The film name
  # @return [String]
  def film_name
    # Hand the sanitizer a copy: if it edits its argument in place, the
    # memoized raw name would lose its "3D" marker and #dimension would
    # report '2d' for every film.
    TitleSanitizer.new(raw_film_name.dup).sanitized
  end

  # Showings hashes
  # @return [Array<Hash>] one hash per screening, each carrying the film
  #   name and dimension plus whatever ScreeningParser#to_hash returns
  def to_a
    return [] unless screenings?
    # identical for every screening of this film, so build it once
    shared = { film_name: film_name, dimension: dimension }
    screening_nodes.map do |node|
      shared.merge(ScreeningParser.new(node).to_hash)
    end
  end

  private

  # '3d' when the unsanitized title mentions 3D (any case), else '2d'
  def dimension
    raw_film_name.match(/3d/i) ? '3d' : '2d'
  end

  # parsed html fragment, memoized
  def doc
    @doc ||= Nokogiri::HTML(@html)
  end

  # text of the first child of the FILM_NAME_CSS match, before sanitizing
  def raw_film_name
    @raw_film_name ||= doc.css(FILM_NAME_CSS).children.first.to_s
  end

  # nodes matching SCREENING_CSS, memoized
  def screening_nodes
    @screening_nodes ||= doc.css(SCREENING_CSS)
  end

  # a node set is never nil, so test for content rather than presence
  def screenings?
    !screening_nodes.empty?
  end
end
|
32
56
|
|
33
|
-
|
34
|
-
|
35
|
-
|
57
|
+
# parse an individual screening node
class ScreeningParser
  # @param [#[]] node a link node for a film screening; only attribute
  #   lookup via [] is used ('class', 'epoch' and the link target)
  def initialize(node)
    @node = node
  end

  # is the screening bookable?
  # @return [Boolean]
  def bookable?
    !!booking_url
  end

  # the attributes of a single screening
  # @return [Hash]
  # @example
  #   PicturehouseUk::Internal::ScreeningParser.new(node).to_hash
  #   => {
  #        booking_url: 'http://...',
  #        time: <Time>,
  #        variant: ['kids']
  #      }
  def to_hash
    {
      # nil rather than a bare site-root url when the node has no link
      booking_url: bookable? ? "http://www.picturehouses.co.uk#{booking_url}" : nil,
      time: time,
      variant: variant
    }
  end

  private

  # Site-relative booking link. The 'html' attribute is tried first so any
  # markup that really carries it behaves as before; 'href' is the standard
  # link attribute and is used when 'html' is absent.
  # NOTE(review): confirm which attribute the live markup uses.
  def booking_url
    @booking_url ||= @node['html'] || @node['href']
  end

  # the 'epoch' attribute read as seconds since 1970-01-01 UTC
  def time
    @time ||= Time.utc(1970) + @node['epoch'].to_i
  end

  # screening kind derived from the node's css class; first match wins
  def variant
    @variant ||= case @node['class']
                 when /big_scream/ then ['baby']
                 when /kids_club|toddler_time/ then ['kids']
                 when /silver_screen/ then ['silver']
                 when /subtitled_cinema/ then ['subtitled']
                 else []
                 end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module PicturehouseUk
|
2
|
+
# @api private
|
3
|
+
module Internal
|
4
|
+
# Sanitize and standardize film titles
class TitleSanitizer
  # strings and regex to be removed
  REMOVE = [
    /\s\[(AS LIVE: )?[ACPGU1258]+\]/, # regular certificate
    /\s+[23][dD]/,                    # 2d or 3d from title
    /\s\[NO CERT\]/,                  # no certificate
    /\s\[\]/,                         # blank certificate
    /ourscreen\: /,                   # ourscreen
    /\s\(Re(\: \d{0,4})?\)/i,         # Re-release
    /\s\[CERT TBC\]/                  # certificate TBC
  ].freeze

  # regexes and the standard prefix that replaces the matched one
  REPLACE = {
    /Met\.? Encore: (.*)/ => 'Met Opera:',
    /Met\.? Opera: (.*)/ => 'Met Opera: ',
    /NT Encore: (.*)/ => 'National Theatre:',
    /NT Live: (.*)/ => 'National Theatre:',
    /ROH\.? Live: (.*)/ => 'Royal Opera House:',
    /RSC\.? Live: (.*)/ => 'Royal Shakespeare Company:',
    /RSC\.? Encore: (.*)/ => 'Royal Shakespeare Company:'
  }.freeze

  # @param [String] title a film title
  def initialize(title)
    @title = title
  end

  # sanitized and standardized title
  # @return [String] title
  def sanitized
    @sanitized ||= begin
      # work on a copy: gsub! would otherwise rewrite the caller's string
      # (and raise on a frozen one)
      title = @title.dup
      REMOVE.each { |pattern| title.gsub!(pattern, '') }
      REPLACE.each do |pattern, prefix|
        # exactly one space between the prefix and the captured title,
        # whether or not the prefix was written with a trailing space
        title.gsub!(pattern) { "#{prefix.rstrip} #{Regexp.last_match(1)}" }
      end
      title.squeeze(' ').strip
    end
  end
end
|
48
|
+
end
|
49
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
|
3
|
+
module PicturehouseUk
|
4
|
+
# @api private
|
5
|
+
module Internal
|
6
|
+
# fetches pages from the picturehouses.co.uk website
class Website
  # get the cinema page with showings for passed id
  # @param [String] id cinema id as used in the site's urls
  # @return [String]
  def cinema(id)
    get("cinema/#{id}/")
  end

  # get the cinema contact information page for passed id
  # @param [String] id cinema id as used in the site's urls
  # @return [String]
  def contact_us(id)
    get("cinema/#{id}/Hires_Info/Contact_Us/")
  end

  # get the home page
  # @return [String]
  def home
    get(nil)
  end

  # get the cinema page containing all upcoming films and screenings
  # @param [String] id cinema id as used in the site's urls
  # @return [String]
  def whatson(id)
    get("whatson?cinema=#{id}")
  end

  private

  # Fetch a path relative to the site root and return the response body.
  # Goes through open-uri's URI#read rather than Kernel#open, which no
  # longer opens URLs on Ruby 3.0+.
  def get(path)
    URI.parse("http://www.picturehouses.co.uk/#{path}").read
  end
end
|
38
|
+
end
|
39
|
+
end
|
@@ -1,36 +1,80 @@
|
|
1
1
|
module PicturehouseUk
|
2
|
-
|
3
2
|
# A single screening of a film on the Picturehouse UK website
class Screening
  # @return [String] the booking URL on the cinema website
  attr_reader :booking_url
  # @return [String] the cinema name
  attr_reader :cinema_name
  # @return [String] 2d or 3d
  attr_reader :dimension
  # @return [String] the film name
  attr_reader :film_name

  # @param [Hash] options
  # @option options [String] :booking_url optional, defaults to nil
  # @option options [String] :cinema_name required
  # @option options [String] :cinema_id required
  # @option options [String] :dimension optional, defaults to '2d'
  # @option options [String] :film_name required
  # @option options [Time] :time required
  # @option options [Array<String>] :variant optional, defaults to []
  # @raise [KeyError] when a required option is missing
  def initialize(options)
    @booking_url = options.fetch(:booking_url, nil)
    @cinema_name = options.fetch(:cinema_name)
    @cinema_id   = options.fetch(:cinema_id)
    @dimension   = options.fetch(:dimension, '2d')
    @film_name   = options.fetch(:film_name)
    @time        = options.fetch(:time)
    @variant     = options.fetch(:variant, [])
  end

  # Screenings at a single cinema
  # @param [String] cinema_id the id of the cinema
  # @return [Array<PicturehouseUk::Screening>]
  def self.at(cinema_id)
    cinema_page(cinema_id).film_html.flat_map do |html|
      create_for_single_film(html, cinema_id)
    end
  end

  # The UTC time of the screening
  # @return [Time]
  def showing_at
    @showing_at ||= begin
      if @time.utc?
        @time
      else
        # a non-UTC time is read as Europe/London wall-clock time
        TZInfo::Timezone.get('Europe/London').local_to_utc(@time)
      end
    end
  end

  # The date of the screening
  # @return [Date]
  def showing_on
    showing_at.to_date
  end

  # The kinds of screening
  # @return [Array<String>] downcased and sorted
  def variant
    @variant.map(&:downcase).sort
  end

  # The helpers below are internal. A bare `private` does not apply to
  # `def self.` methods, so they are (and always were) publicly callable;
  # visibility is left as it was.

  # @api private
  # @return [Hash] the cinema id and its name
  def self.cinema_hash(cinema_id)
    {
      cinema_id: cinema_id,
      cinema_name: PicturehouseUk::Cinema.find(cinema_id).name
    }
  end

  # @api private
  def self.cinema_page(cinema_id)
    PicturehouseUk::Internal::CinemaPage.new(cinema_id)
  end

  # @api private
  # @return [Array<PicturehouseUk::Screening>] screenings parsed from one
  #   film's html chunk
  def self.create_for_single_film(html, cinema_id)
    parsed = screenings_parser(html).to_a
    return [] if parsed.empty?
    # look the cinema up once per film rather than once per screening
    cinema = cinema_hash(cinema_id)
    parsed.map { |attributes| new(cinema.merge(attributes)) }
  end

  # @api private
  def self.screenings_parser(html)
    PicturehouseUk::Internal::FilmWithScreeningsParser.new(html)
  end
end
|
36
80
|
end
|
data/picturehouse_uk.gemspec
CHANGED
@@ -4,25 +4,25 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
4
|
require 'picturehouse_uk/version'
|
5
5
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name =
|
7
|
+
spec.name = 'picturehouse_uk'
|
8
8
|
spec.version = PicturehouseUk::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = [
|
11
|
-
spec.description =
|
12
|
-
spec.summary =
|
13
|
-
spec.homepage =
|
14
|
-
spec.license =
|
9
|
+
spec.authors = ['Andy Croll']
|
10
|
+
spec.email = ['andy@goodscary.com']
|
11
|
+
spec.description = 'An API to pull movie information from the picturehouse.co.uk website'
|
12
|
+
spec.summary = "It's a scraper, but a nice one"
|
13
|
+
spec.homepage = ''
|
14
|
+
spec.license = 'MIT'
|
15
15
|
|
16
16
|
spec.files = `git ls-files`.split($/)
|
17
17
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
-
spec.require_paths = [
|
19
|
+
spec.require_paths = ['lib']
|
20
20
|
|
21
|
-
spec.add_development_dependency
|
22
|
-
spec.add_development_dependency
|
21
|
+
spec.add_development_dependency 'bundler', '~> 1.3'
|
22
|
+
spec.add_development_dependency 'minitest-reporters'
|
23
|
+
spec.add_development_dependency 'rake'
|
23
24
|
spec.add_development_dependency 'webmock'
|
24
25
|
|
25
|
-
spec.add_runtime_dependency 'httparty'
|
26
26
|
spec.add_runtime_dependency 'nokogiri'
|
27
27
|
spec.add_runtime_dependency 'tzinfo'
|
28
28
|
spec.add_runtime_dependency 'tzinfo-data'
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require File.expand_path('../../lib/picturehouse_uk.rb', __FILE__)

# Absolute path of a fixture file under test/fixtures
# @param [String] name fixture path relative to fixtures/, without '.html'
# @return [String]
def fixture(name)
  File.expand_path("../fixtures/#{name}.html", __FILE__)
end

# Announce a fixture, obtain its content from the block, then write it.
# The content is fetched BEFORE the file is opened for writing, so a failed
# download raises without truncating the existing fixture.
# @param [String] name fixture path relative to fixtures/, without '.html'
# @param [String] label text printed after '* '
def update_fixture(name, label)
  puts "* #{label}"
  content = yield
  File.open(fixture(name), 'w') { |file| file.write content }
end

website = PicturehouseUk::Internal::Website.new

update_fixture('home', 'Homepage') { website.home }

# DUKE OF YORKS

update_fixture('cinema/Duke_Of_Yorks', 'Duke of Yorks') do
  website.cinema('Duke_Of_Yorks')
end

update_fixture('contact_us/Duke_Of_Yorks', 'Duke of Yorks Information') do
  website.contact_us('Duke_Of_Yorks')
end

# KOMEDIA

update_fixture('contact_us/Dukes_At_Komedia', 'Dukes at Komedia Information') do
  website.contact_us('Dukes_At_Komedia')
end

# FILMS

page = PicturehouseUk::Internal::CinemaPage.new('Duke_Of_Yorks')

update_fixture('cinema/Duke_Of_Yorks/film_second', 'Duke of Yorks Second Film') do
  page.film_html[1]
end

update_fixture('cinema/Duke_Of_Yorks/film_last', 'Duke of Yorks Last Film') do
  page.film_html[-1]
end
|