yayimdbs 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. data/lib/yay_imdbs.rb +30 -10
  2. metadata +3 -3
data/lib/yay_imdbs.rb CHANGED
@@ -18,6 +18,13 @@ class YayImdbs
18
18
 
19
19
  STRIP_WHITESPACE = /(\s{2,}|\n|\||\302\240\302\273)/u
20
20
 
21
+ MORE_INFO_LINKS = ['See more',
22
+ 'Add/edit official sites',
23
+ 'See all certifications',
24
+ 'See full summary',
25
+ 'see all',
26
+ ]
27
+
21
28
  DATE_PROPERTIES = [:release_date]
22
29
  LIST_PROPERTIES = [:genres, :plot_keywords, :country, :sound_mix, :language]
23
30
  INT_LIST_PROPERTIES = [:year, :season]
@@ -26,7 +33,8 @@ class YayImdbs
26
33
  :year => :years,
27
34
  :season => :seasons,
28
35
  :language => :languages,
29
- :motion_picture_rating_mpaa => :mpaa}
36
+ :motion_picture_rating_mpaa => :mpaa,
37
+ :official_sites => :official_site}
30
38
 
31
39
  class << self
32
40
 
@@ -95,7 +103,7 @@ class YayImdbs
95
103
  movie_properties(doc) do |key, value|
96
104
  found_info_divs = true
97
105
  info_hash["raw_#{key}"] = value
98
- info_hash[key] = clean_movie_property(key, value)
106
+ info_hash[key] = clean_movie_property(key, value, imdb_id)
99
107
  info_hash[PROPERTY_ALIAS[key]] = info_hash[key] if PROPERTY_ALIAS[key]
100
108
  end
101
109
 
@@ -115,7 +123,7 @@ class YayImdbs
115
123
  return info_hash
116
124
  end
117
125
 
118
- def clean_movie_property(key, value)
126
+ def clean_movie_property(key, value, imdb_id)
119
127
  if DATE_PROPERTIES.include?(key)
120
128
  value = Date.strptime(value, '%d %B %Y') rescue nil
121
129
  elsif key == :runtime
@@ -124,6 +132,8 @@ class YayImdbs
124
132
  else
125
133
  value = nil
126
134
  end
135
+ elsif key == :official_sites
136
+ value = get_official_site_url(value, imdb_id)
127
137
  elsif LIST_PROPERTIES.include?(key)
128
138
  value = value.split('|').collect { |l| l.gsub(/[^a-zA-Z0-9\-]/, '') }
129
139
  elsif INT_LIST_PROPERTIES.include?(key)
@@ -138,15 +148,21 @@ class YayImdbs
138
148
  raw_key = h4.inner_text
139
149
  key = raw_key.sub(':', '').strip.downcase
140
150
  value = div.inner_text[((div.inner_text =~ /#{Regexp.escape(raw_key)}/) + raw_key.length).. -1]
141
- value = value.gsub(/\302\240\302\273/u, '').strip.gsub(/(See more)|(see all)|(See all certifications)|(Add\/edit official sites)|(See full summary)$/, '').strip
151
+ value = value.gsub(/\302\240\302\273/u, '').strip.gsub(/(#{MORE_INFO_LINKS.join(')|(')})$/i, '').strip
142
152
  symbol_key = key.downcase.gsub(/[^a-zA-Z0-9 ]/, '').gsub(/\s/, '_').to_sym
143
- if symbol_key == :official_sites
144
- value = div.inner_html.match(/href=\"(.*?)\"/)[1]
145
- end
146
153
  yield symbol_key, value
147
154
  end
148
155
  end
149
156
 
157
+ # TODO capture all official sites, not all sites have an "Official site" link (e.g. Lost)
158
+ def get_official_site_url(value, imdb_id)
159
+ value = value.match(/<a href="(.*?)">Official site<\/a>/)
160
+ if value.nil?
161
+ value = get_official_sites_page(imdb_id).inner_html.match(/<a href="(.*?)">Official site<\/a>/)
162
+ end
163
+ return $1
164
+ end
165
+
150
166
  def scrap_images(doc, info_hash)
151
167
  #scrap poster image urls
152
168
  thumbnail_url = doc.at_css("td[id=img_primary] a img").try(:[], 'src')
@@ -174,9 +190,9 @@ class YayImdbs
174
190
 
175
191
  raw_date = e_div.at_css('strong').inner_text.strip
176
192
  episode['date'] = Date.parse(raw_date) rescue nil
177
- if e_div.inner_text =~ /#{raw_date}/
178
- episode['plot'] = $'.strip
179
- end
193
+
194
+ # Seems that the day can sometimes be ???? which doesn't play well with regex
195
+ episode['plot'] = $'.strip if e_div.inner_text =~ /#{raw_date}/ rescue nil
180
196
 
181
197
  episodes << episode
182
198
  end
@@ -190,6 +206,10 @@ class YayImdbs
190
206
  def get_movie_page(imdb_id)
191
207
  Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id))
192
208
  end
209
+
210
+ def get_official_sites_page(imdb_id)
211
+ Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id + '/officialsites' ))
212
+ end
193
213
 
194
214
  def get_episodes_page(imdb_id)
195
215
  Nokogiri::HTML(open(IMDB_MOVIE_URL + imdb_id + '/episodes'))
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 2
8
- - 3
9
- version: 0.2.3
8
+ - 4
9
+ version: 0.2.4
10
10
  platform: ruby
11
11
  authors:
12
12
  - Sam Cavenagh
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-07-28 00:00:00 +10:00
17
+ date: 2011-08-02 00:00:00 +10:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency