yayimdbs 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/yay_imdbs.rb +30 -10
- metadata +3 -3
data/lib/yay_imdbs.rb
CHANGED
@@ -18,6 +18,13 @@ class YayImdbs
|
|
18
18
|
|
19
19
|
STRIP_WHITESPACE = /(\s{2,}|\n|\||\302\240\302\273)/u
|
20
20
|
|
21
|
+
MORE_INFO_LINKS = ['See more',
|
22
|
+
'Add/edit official sites',
|
23
|
+
'See all certifications',
|
24
|
+
'See full summary',
|
25
|
+
'see all',
|
26
|
+
]
|
27
|
+
|
21
28
|
DATE_PROPERTIES = [:release_date]
|
22
29
|
LIST_PROPERTIES = [:genres, :plot_keywords, :country, :sound_mix, :language]
|
23
30
|
INT_LIST_PROPERTIES = [:year, :season]
|
@@ -26,7 +33,8 @@ class YayImdbs
|
|
26
33
|
:year => :years,
|
27
34
|
:season => :seasons,
|
28
35
|
:language => :languages,
|
29
|
-
:motion_picture_rating_mpaa => :mpaa
|
36
|
+
:motion_picture_rating_mpaa => :mpaa,
|
37
|
+
:official_sites => :official_site}
|
30
38
|
|
31
39
|
class << self
|
32
40
|
|
@@ -95,7 +103,7 @@ class YayImdbs
|
|
95
103
|
movie_properties(doc) do |key, value|
|
96
104
|
found_info_divs = true
|
97
105
|
info_hash["raw_#{key}"] = value
|
98
|
-
info_hash[key] = clean_movie_property(key, value)
|
106
|
+
info_hash[key] = clean_movie_property(key, value, imdb_id)
|
99
107
|
info_hash[PROPERTY_ALIAS[key]] = info_hash[key] if PROPERTY_ALIAS[key]
|
100
108
|
end
|
101
109
|
|
@@ -115,7 +123,7 @@ class YayImdbs
|
|
115
123
|
return info_hash
|
116
124
|
end
|
117
125
|
|
118
|
-
def clean_movie_property(key, value)
|
126
|
+
def clean_movie_property(key, value, imdb_id)
|
119
127
|
if DATE_PROPERTIES.include?(key)
|
120
128
|
value = Date.strptime(value, '%d %B %Y') rescue nil
|
121
129
|
elsif key == :runtime
|
@@ -124,6 +132,8 @@ class YayImdbs
|
|
124
132
|
else
|
125
133
|
value = nil
|
126
134
|
end
|
135
|
+
elsif key == :official_sites
|
136
|
+
value = get_official_site_url(value, imdb_id)
|
127
137
|
elsif LIST_PROPERTIES.include?(key)
|
128
138
|
value = value.split('|').collect { |l| l.gsub(/[^a-zA-Z0-9\-]/, '') }
|
129
139
|
elsif INT_LIST_PROPERTIES.include?(key)
|
@@ -138,15 +148,21 @@ class YayImdbs
|
|
138
148
|
raw_key = h4.inner_text
|
139
149
|
key = raw_key.sub(':', '').strip.downcase
|
140
150
|
value = div.inner_text[((div.inner_text =~ /#{Regexp.escape(raw_key)}/) + raw_key.length).. -1]
|
141
|
-
value = value.gsub(/\302\240\302\273/u, '').strip.gsub(/(
|
151
|
+
value = value.gsub(/\302\240\302\273/u, '').strip.gsub(/(#{MORE_INFO_LINKS.join(')|(')})$/i, '').strip
|
142
152
|
symbol_key = key.downcase.gsub(/[^a-zA-Z0-9 ]/, '').gsub(/\s/, '_').to_sym
|
143
|
-
if symbol_key == :official_sites
|
144
|
-
value = div.inner_html.match(/href=\"(.*?)\"/)[1]
|
145
|
-
end
|
146
153
|
yield symbol_key, value
|
147
154
|
end
|
148
155
|
end
|
149
156
|
|
157
|
+
# TODO capture all official sites, not all sites have an "Official site" link (e.g. Lost)
|
158
|
+
def get_official_site_url(value, imdb_id)
|
159
|
+
value = value.match(/<a href="(.*?)">Official site<\/a>/)
|
160
|
+
if value.nil?
|
161
|
+
value = get_official_sites_page(imdb_id).inner_html.match(/<a href="(.*?)">Official site<\/a>/)
|
162
|
+
end
|
163
|
+
return $1
|
164
|
+
end
|
165
|
+
|
150
166
|
def scrap_images(doc, info_hash)
|
151
167
|
#scrap poster image urls
|
152
168
|
thumbnail_url = doc.at_css("td[id=img_primary] a img").try(:[], 'src')
|
@@ -174,9 +190,9 @@ class YayImdbs
|
|
174
190
|
|
175
191
|
raw_date = e_div.at_css('strong').inner_text.strip
|
176
192
|
episode['date'] = Date.parse(raw_date) rescue nil
|
177
|
-
|
178
|
-
|
179
|
-
|
193
|
+
|
194
|
+
# Seems that the day can sometimes be ???? which doesnt play will with regex
|
195
|
+
episode['plot'] = $'.strip if e_div.inner_text =~ /#{raw_date}/ rescue nil
|
180
196
|
|
181
197
|
episodes << episode
|
182
198
|
end
|
@@ -190,6 +206,10 @@ class YayImdbs
|
|
190
206
|
def get_movie_page(imdb_id)
|
191
207
|
Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id))
|
192
208
|
end
|
209
|
+
|
210
|
+
def get_official_sites_page(imdb_id)
|
211
|
+
Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id + '/officialsites' ))
|
212
|
+
end
|
193
213
|
|
194
214
|
def get_episodes_page(imdb_id)
|
195
215
|
Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id + '/episodes'))
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 2
|
8
|
-
- 3
|
9
|
-
version: 0.2.3
|
8
|
+
- 4
|
9
|
+
version: 0.2.4
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Sam Cavenagh
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-08-02 00:00:00 +10:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|