serienrenamer 0.1.2 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,181 +0,0 @@
1
- #
2
- # Class that extracts information about episodes
3
- # from the serienjunkies.org-Page
4
- #
5
- require 'uri'
6
- require 'mechanize'
7
- require 'yaml'
8
-
9
- module Serienrenamer
10
- module Plugin
11
-
12
- class SerienjunkiesOrg < Serienrenamer::Pluginbase
13
-
14
# Public: name under which this plugin identifies itself
def self.plugin_name
  "SerienjunkiesOrg"
end

# Public: base url of the site that is queried by this plugin
def self.plugin_url
  "http://serienjunkies.org"
end

# Public: usability flag of this plugin (always true)
def self.usable
  true
end

# Public: numeric priority value of this plugin
def self.priority
  60
end
18
-
19
# Public: tries to search for an appropriate episodename
#
# On the first call for a series the series page is looked up and
# parsed; the result is kept in @cached_data (keyed by series name)
# so that later calls for the same series need no further requests.
# Nothing is cached when no series page could be found.
#
# episode - Serienrenamer::Episode instance which holds the information
#
# Returns an array with the matching episodename or an empty array
# Raises ArgumentError if episode is not a Serienrenamer::Episode
def self.generate_episode_information(episode)
  unless episode.is_a?(Serienrenamer::Episode)
    raise ArgumentError, "Serienrenamer::Episode instance needed"
  end

  @cached_data ||= {}
  series_name = episode.series

  if !@cached_data.has_key?(series_name) && series_name.to_s.match(/\w+/)
    # determine link to series
    seriespage_link = find_link_to_series_page(series_name)
    if seriespage_link
      @cached_data[series_name] = parse_seriespage(seriespage_link)
    end
  end

  series = @cached_data[series_name]
  return [] unless series

  # the keys in the cached hash are built from integers
  # ("<season>_<episode>"), so season and episode are normalized with
  # to_i; a string like "08" would otherwise make "%d" raise
  identifier = "%d_%d" % [ episode.season.to_i, episode.episode.to_i ]
  episodename = series[identifier]

  return [] unless episodename.is_a?(String) && episodename.match(/\w+/)
  [ episodename ]
end
70
-
71
# Public: tries to find a link to the seriespage
#
# The index page for the first letter of the series name is fetched
# and the first sidebar link whose text contains all words of the
# name (in order, case-insensitive) is returned.
#
# seriesname - the series name for which the page is searched
#
# Returns the absolute link as String or nil if nothing matched
# Raises ArgumentError if seriesname contains no word characters
def self.find_link_to_series_page(seriesname)
  raise ArgumentError, "seriesname expected" unless seriesname.match(/\w+/)

  self.build_agent unless defined? @agent

  name = seriesname.strip

  # the letter is user data, so it is encoded before it is placed
  # into the query string
  initial = URI.encode_www_form_component(name[0].downcase)
  url = URI.join(plugin_url, "?cat=0&l=%s" % initial)

  # every word is escaped so that characters like "?", "(" or "."
  # in a series name are matched literally instead of being
  # interpreted as regular expression syntax
  words = name.split(/\s+/).map { |word| Regexp.escape(word) }
  pattern = Regexp.new(words.join(".*"), Regexp::IGNORECASE)

  @agent.get(url).search("div#sidebar > ul > li > a").each do |series|
    if series.text.match(pattern)
      return URI.join(plugin_url, series[:href]).to_s
    end
  end

  nil
end
93
-
94
# Public: parses a series page and extracts the episode information
#
# Every sidebar link whose text contains "Staffel" (german) or
# "Season" (otherwise) is fetched and handed to parse_season_subpage;
# the resulting hashes are merged into one.
#
# page_url - the url to the seriespage
# german   - if true it extracts only german data (Defaults to true)
# debug    - if true the result is printed as YAML (Defaults to false)
#
# Returns a hash which contains the episode information or an empty
# hash if there aren't any episodes
def self.parse_seriespage(page_url, german=true, debug=false)
  self.build_agent unless defined? @agent

  season_marker = german ? /Staffel/i : /Season/i
  sidebar_links =
    @agent.get(page_url).search('div#sidebar > div#scb > div.bkname > a')

  series = {}
  sidebar_links.each do |anchor|
    if anchor.content.match(season_marker)
      season_page = @agent.get(anchor[:href])
      series.merge!(self.parse_season_subpage(season_page, german))
    end
  end

  puts series.to_yaml if debug
  series
end
125
-
126
# Public: extracts the episodes from one season
#
# In german mode only entries that mention "German" and do not mention
# "Subbed" are used; otherwise only entries that do not mention
# "German". If several entries map to the same season/episode id a
# stored name is only kept when it is longer than the new one.
#
# page   - Mechanize page object which holds the season
# german - extracts german or international episodes
#
# Returns a hash with all episodes (unique), keyed by "<season>_<episode>"
def self.parse_season_subpage(page, german=true)
  entries = page.search('div.post > div.post-content strong:nth-child(1)')

  entries.each_with_object({}) do |entry, found|
    text = entry.content
    info = Serienrenamer::Episode.extract_episode_information(text)
    next unless info

    mentions_german = text.match(/German/i)
    wanted = if german
               mentions_german && !text.match(/Subbed/i)
             else
               !mentions_german
             end
    next unless wanted

    title = Serienrenamer::Episode.clean_episode_data(info[:episodename], true)
    next unless title && title.match(/\w+/)

    key = "%d_%d" % [ info[:season].to_i, info[:episode].to_i ]
    current = found[key]
    found[key] = title unless current && current.size > title.size
  end
end
163
-
164
# NOTE: a bare `private` does not change the visibility of methods that
# are defined with `def self.`, so build_agent stays callable with an
# explicit receiver (the other methods call it as self.build_agent)
private

# Private: constructs a Mechanize instance and adds a fix that interprets
# every response without a content type as html
#
# Returns the agent (which is also stored in @agent)
def self.build_agent
  default_to_html = lambda do |_agent, _uri, response, _body|
    current_type = response.content_type
    if current_type.nil? || current_type.empty?
      response.content_type = 'text/html'
    end
  end

  @agent = Mechanize.new do |mech|
    mech.post_connect_hooks << default_to_html
  end
end
179
- end
180
- end
181
- end
@@ -1,448 +0,0 @@
1
- # encoding: UTF-8
2
- require 'media_wiki'
3
-
4
- module Serienrenamer
5
- module Plugin
6
-
7
- # This Plugin tries to extract the series
8
- # information from wikipedia
9
- #
10
- # (by now only the german wikipedia)
11
- class Wikipedia < Serienrenamer::Pluginbase
12
-
13
# name under which this plugin identifies itself
def self.plugin_name
  "Wikipedia"
end

# usability flag of this plugin (always true)
def self.usable
  true
end

# numeric priority value of this plugin
def self.priority
  30
end
16
-
17
# api endpoint of the german wikipedia
@@WIKIPEDIA_URL = 'http://de.wikipedia.org/w/api.php'

# patterns used in this class

# episode list page: captures the {{Episodenlistentabelle ...}} template
# and splits it into the single episode entries
@@EPISODE_TABLE_PATTERN = /.*(?<table>\{\{Episodenlistentabelle.*\}\})\s*$/m
@@EPISODE_ENTRY_PATTERN = /\{\{Episodenlisteneintrag|S-Episode/

# page type detection
@@SERIES_SITE_TEST_PATTERN = /\{\{Infobox.Fernsehsendung.*\}\}/m
@@DISAMBIGUATION_TEST_PATTERN = /\{\{Begriffsklärung\}\}/m

# series page: link to a separate episode list article ("Liste ...")
@@CONTAINS_LINK_TO_EPISODE_LIST = /Hauptartikel.*(?<main>Liste.*?)[\]\}]+/

# series page with an embedded episode list (first group: season number)
@@CONTAINS_INARTICLE_EPISODE_LIST = /\<div.*\>Staffel.(\d+).*\<\/div\>.*class=\"wikitable\".*titel/m
@@INPAGE_SEASON_SEPARATOR = /\<div.style=\"clear:both\;.class=\"NavFrame\"\>/
@@WIKITABLE_EXTRACT_PATTERN = /(\{\|.class=\"wikitable\".*\|\})\n/m
@@IS_ONE_LINE_EPISODE_LIST = /\|.*\|\|.*\|\|.*\|\|/m
29
-
30
-
31
# this method will be called from the main program
# with an Serienrenamer::Episode instance as parameter
#
# on the first call for a series the wikipedia is searched for the
# series page and its episode list; the parsed list is stored in
# @cached_data (keyed by series name) and reused by later calls, so
# a gateway is only created when the series is not cached yet
#
# it returns an array with the matching episode name or an empty
# array if nothing was found
#
# raises ArgumentError if episode is not a Serienrenamer::Episode
def self.generate_episode_information(episode)
  unless episode.is_a?(Serienrenamer::Episode)
    raise ArgumentError, "Serienrenamer::Episode instance needed"
  end

  series_name = episode.series
  return [] unless series_name.match(/\w+/)

  @cached_data ||= {}

  unless @cached_data.has_key?(series_name)
    wiki = MediaWiki::Gateway.new(@@WIKIPEDIA_URL)

    # search for a series site in wikipedia
    series_site = find_series_site(wiki, series_name)
    return [] unless series_site

    # look for a link to a list of episodes
    pagedata = wiki.get(series_site)
    return [] unless pagedata.is_a?(String)

    if contains_link_to_episode_list?(pagedata)
      mainarticle = pagedata.match(@@CONTAINS_LINK_TO_EPISODE_LIST)[:main]
      episodelist_page = wiki.get(mainarticle)

      # NOTE(review): the gateway presumably returns nil for a page that
      # does not exist - in that case there is nothing to parse
      return [] unless episodelist_page.is_a?(String)

      @cached_data[series_name] = parse_episodelist_page_data(episodelist_page)

    elsif contains_inarticle_episode_list?(pagedata)
      @cached_data[series_name] = parse_inarticle_episodelist_page_data(pagedata)

    else
      warn "no episode list found"
      return []
    end
  end

  # tries to find an episodename in cached_data
  # otherwise returns empty array; season and episode are used as
  # array indizes and are therefore normalized to integers
  series = @cached_data[series_name]
  return [] unless series.is_a?(Array)

  season = series[episode.season.to_i]
  return [] unless season.is_a?(Array)

  episodename = season[episode.episode.to_i]
  return [] unless episodename.is_a?(String) && episodename.match(/\w+/)

  [ episodename ]
end

# searches the wikipedia for the main page of a series
#
# the complete series name is tried first; if that gives no series
# page and the last word of the name differs from the name itself,
# the last word alone is tried as second search term.
# A MediaWiki::APIError leads to a retry of the current search, at
# most three errors are tolerated in total.
#
# returns the title of the series page or nil
def self.find_series_site(wiki, series_name)
  search_terms = [ series_name ]
  last_word = series_name[/(\w+)\s*$/, 1]
  search_terms << last_word if last_word && last_word != series_name

  tries = 3

  search_terms.each do |term|
    begin
      wiki.search(term, nil, 15).each do |title|
        pagedata = wiki.get(title)
        return title if pagedata && is_series_main_page?(pagedata)
      end
    rescue MediaWiki::APIError
      tries -= 1
      retry if tries > 0
      return nil
    end
  end

  nil
end
118
-
119
-
120
# This method will extract season based information
# from a string that contains a wikipedia episodelist page
#
# the page is split at its headings; the paragraph that follows a
# heading of the form 'Staffel <number>' is handed to
# parse_season_table and stored under that season number
#
# returns an Array of Arrays with episode information
# where episode and season numbers are the indizes
#
# raises ArgumentError if pagedata is not a String
def self.parse_episodelist_page_data(pagedata, debug=false)
  unless pagedata.is_a?(String)
    raise ArgumentError, 'String with pagedata expected'
  end

  seasons = []

  # number of the season whose table is expected in the next
  # paragraph, nil if the last heading was no season heading
  pending_season = nil

  pagedata.split(/(==.*)==/).each do |chunk|
    if chunk.match(/^==.*Staffel/)
      numbered = chunk.match(/^==.*Staffel.(?<seasonnr>\d+)/)
      pending_season = numbered[:seasonnr].to_i if numbered
    elsif pending_season
      seasons[pending_season] = parse_season_table(chunk)
      pending_season = nil
    end
  end

  seasons
end
155
-
156
-
157
# this method will be called with a wikipedia seasontable
# as parameter and will extract all episodes from this
# and returns that as an array where the episode number is
# the index
#
# entries without a season episode number (NR_ST) containing a digit
# or without a german title (DT) are skipped
#
# raises ArgumentError if table is not a String
def self.parse_season_table(table)
  unless table.is_a?(String)
    raise ArgumentError, 'String with seasontable expected'
  end

  season_data = []

  matched_table = table.match(@@EPISODE_TABLE_PATTERN)
  return season_data unless matched_table

  # extract all episode entries that
  # looks like the following
  #
  # {{Episodenlisteneintrag
  # | NR_GES = 107
  # | NR_ST = 1
  # | OT = The Mastodon in the Room
  # | DT = Die Rückkehr der Scheuklappen
  # | ZF =
  # | EA = {{dts|23|09|2010}}
  # | EAD = {{dts|08|09|2011}}
  # }}
  matched_table[:table].split(@@EPISODE_ENTRY_PATTERN).each do |epi|

    # build up a hash from the entry
    infos = {}
    epi.lines.each do |part|
      parts = part.strip.match(/(?<key>\w+).=.(?<value>.*)$/)
      infos[parts[:key].strip] = parts[:value].strip if parts
    end

    # an entry is only usable if it has a numeric episode number and
    # a german title; a missing DT previously aborted the whole parse
    # with a NoMethodError
    epi_nr_raw = infos['NR_ST']
    epi_name_raw = infos['DT']
    next unless epi_nr_raw && epi_nr_raw.match(/\d/)
    next unless epi_name_raw

    epi_nr = epi_nr_raw.to_i

    # TODO make the title key ('DT') variable
    #
    # remove all html tags and all following
    # text from the episode name and the link
    # brackets from mediawiki [[text]]
    epi_name = epi_name_raw.gsub(/<\/?[^>]*>.*/, "")
    epi_name = epi_name.gsub(/[\[\[\]\]]/, "").strip
    next unless epi_name.match(/\w+/)

    season_data[epi_nr] = epi_name
  end

  season_data
end
219
-
220
-
221
# This method will extract season based information
# from a string that contains a series page with an
# episodelist included
#
# returns an Array of Arrays with episode information
# where episode and season numbers are the indizes
#
# raises ArgumentError if pagedata is not a String or if
# no paragraph with an episodelist was found
def self.parse_inarticle_episodelist_page_data(pagedata, debug=false)
  unless pagedata.is_a?(String)
    raise ArgumentError, 'String with pagedata expected'
  end

  series_data = []

  # look for a paragraph with an episodelist
  episodelist_paragraph = pagedata.split(/==.*==/).find do |paragraph|
    contains_inarticle_episode_list?(paragraph)
  end

  raise ArgumentError, 'no episodelist found' unless episodelist_paragraph

  # iterate through all seasons in this episode table
  episodelist_paragraph.split(@@INPAGE_SEASON_SEPARATOR).each do |season|
    season_header = season.match(@@CONTAINS_INARTICLE_EPISODE_LIST)
    next unless season_header

    season_nr = season_header[1].to_i

    # a season block whose table cannot be extracted is skipped
    # instead of aborting the whole page with a NoMethodError
    table_match = season.match(@@WIKITABLE_EXTRACT_PATTERN)
    next unless table_match

    wikitable = table_match[1]

    # we have to detect the type of the inarticle season page
    # because there are two different kinds of table structures
    # used in the german wikipedia
    if is_episode_list_with_one_episode_per_line?(wikitable)
      episodes = parse_inarticle_season_table_with_one_line(wikitable)
    else
      episodes = parse_inarticle_season_table(wikitable)
    end

    # HACK if a season is splitted into different parts
    # eg. Flashpoint (2.1 and 2.2) than merge that if possible;
    # entries of the current part win over earlier parts
    earlier_part = series_data[season_nr]
    if earlier_part
      earlier_part.each_with_index do |item, index|
        episodes[index] ||= item
      end
    end

    series_data[season_nr] = episodes
  end

  series_data
end
269
-
270
-
271
# this method will be called with a wikitable for a season
# as parameter and will extract all episodes from this
# and returns that as an array where the episode number is
# the index
#
# the table is split into rows at the "|-" lines; the header row
# (recognized by its width="..." attributes) tells on which line of
# a row the season episode number and the german title are located.
# Rows with less than four lines, rows that appear before the header
# and rows that are too short to contain both lines are skipped.
#
# raises ArgumentError if table is not a String
#
# Example for an wikitable for episodes:
#
# {| class="wikitable" width="100%"
# |- vertical-align: top; text-align:center; "
# | width="15" | '''Nummer''' <br /><small>(Gesamt)<small>
# | width="15" | '''Nummer''' <br /><small>(Staffel)<small>
# ! width="250" | Originaltitel
# ! width="250" | Deutscher Titel
# ! width="180" | Erstausstrahlung<br /><small>(USA Network)</small>
# ! width="180" | Erstausstrahlung<br /><small>(RTL)</small>
# ! width="180" | Erstausstrahlung<br /><small>(SF zwei)</small>
# |-
# | bgcolor="#DFEEEF"| 01
# | 01
# | ''Pilot''
# | ''Auch Reiche sind nur Menschen''
# | 4. Mai 2009
# | 17. Mai 2011
# | 6. Juni 2011 (Teil 1)<br />13. Juni 2011 (Teil 2)
# |-
# |}
#
def self.parse_inarticle_season_table(table)
  unless table.is_a?(String)
    raise ArgumentError, 'String with seasontable expected'
  end

  season_data = []
  episode_nr_line_nr = nil
  episode_name_line_nr = nil

  table.split(/^\|\-.*$/).each do |tablerow|
    row = tablerow.strip

    # skip invalid rows
    lines = row.lines.to_a
    next unless lines.length >= 4

    if row.match(/width=\"\d+\"/)
      # extract line numbers for needed data that
      # are in the table header
      lines.each_with_index do |item, index|
        if item.match(/Nummer.*Staffel/i)
          episode_nr_line_nr = index

        # TODO make the following more variable
        elsif item.match(/Deutscher.*Titel/i)
          episode_name_line_nr = index
        end
      end
      next
    end

    # extract episode information
    next unless episode_nr_line_nr && episode_name_line_nr

    # a row may have fewer lines than the header (at least four are
    # guaranteed only), so both lines have to be checked for presence
    nr_line = lines[episode_nr_line_nr]
    name_line = lines[episode_name_line_nr]
    next unless nr_line && name_line

    md_nr = nr_line.strip.match(/(\d+)/)
    md_name = name_line.strip.match(/^\|.(.*)$/)
    next unless md_nr && md_name

    # remove the wiki markup for italic/bold text and links
    episode_name = md_name[1].gsub(/[\'\"\[\]]/, "").strip
    next unless episode_name.match(/\w+/)

    season_data[md_nr[1].to_i] = episode_name
  end

  season_data
end
348
-
349
-
350
# this method will be called with a wikitable for a season
# as parameter and will extract all episodes from this
# and returns that as an array where the episode number is
# the index
#
# this method implements a special format that takes place in
# e.g. 'Prison Break' where an episode is not spread along several
# lines like in parse_inarticle_season_table
#
# rows that appear before the table header, rows without digits in
# the episode column and rows without a title are skipped
#
# raises ArgumentError if table is not a String
#
# Example for an wikitable for episodes:
#
# {| class="wikitable"
# |- style="color:#black; background-color:#006699"
# ! '''Episode''' !! '''Deutscher Titel''' !! '''Originaltitel''' !! '''Erstausstrahlung (DE)''' !! '''Erstausstrahlung (USA)'''
# |-
# |'''1''' (1-01) || Der große Plan || Pilot || 21. Juni 2007 || 29. August 2005
# |-
# |'''2''' (1-02) || Lügt Lincoln? || Allen || 21. Juni 2007 || 29. August 2005
# |-
# |'''3''' (1-03) || Vertrauenstest || Cell Test || 28. Juni 2007 || 5. September 2005
# |-
# |'''4''' (1-04) || Veronica steigt ein || Cute Poison || 28. Juni 2007 || 12. September 2005
#
def self.parse_inarticle_season_table_with_one_line(table)
  unless table.is_a?(String)
    raise ArgumentError, 'String with seasontable expected'
  end

  season_data = []
  episode_nr_col = nil
  episode_name_col = nil

  table.split(/^\|\-.*$/).each do |tablerow|

    if tablerow.match(/!!.*!!.*!!/)
      # extract column numbers from table header
      tablerow.split(/!!/).each_with_index do |col, index|
        episode_nr_col = index if col.match(/Episode/i)
        episode_name_col = index if col.match(/Deutsch.*Titel/i)
      end

    elsif tablerow.match(/\|\|.*\w+.*\|\|/)
      # without a header the columns are unknown
      next unless episode_nr_col && episode_name_col

      columns = tablerow.strip.split(/\|\|/)
      nr_cell = columns[episode_nr_col]
      name_cell = columns[episode_name_col]
      next unless nr_cell && name_cell

      # the last bunch of digits in the column is the episode number
      # " '''7''' (1-07) " => "07"
      #
      # this is some kind of format independent; columns without
      # any digit (e.g. specials) have no episode number
      episode_nr = nr_cell.scan(/\d+/).last
      next unless episode_nr

      episode_name = name_cell.strip
      next unless episode_name.match(/\w+/)

      season_data[episode_nr.to_i] = episode_name
    end
  end

  season_data
end
412
-
413
-
414
# this method checks if the page is the main page
# for a series
#
# returns true if page matches the pattern for the infobox
# (@@SERIES_SITE_TEST_PATTERN), false otherwise
def self.is_series_main_page?(page)
  !page.match(@@SERIES_SITE_TEST_PATTERN).nil?
end
422
-
423
# check the site if it is a disambiguation site
#
# returns true if page matches the disambiguation pattern
# (@@DISAMBIGUATION_TEST_PATTERN), false otherwise
def self.is_disambiguation_site?(page)
  !page.match(@@DISAMBIGUATION_TEST_PATTERN).nil?
end
430
-
431
# test if the page contains a link to an article
# with an episode list
#
# returns true or false
def self.contains_link_to_episode_list?(page)
  !page.match(@@CONTAINS_LINK_TO_EPISODE_LIST).nil?
end
436
-
437
# test if the page contains an episode list
#
# returns true or false
def self.contains_inarticle_episode_list?(page)
  !page.match(@@CONTAINS_INARTICLE_EPISODE_LIST).nil?
end
441
-
442
# tests for the type of in article episode list
#
# returns true if page matches the pattern for lists that hold
# one episode per line (@@IS_ONE_LINE_EPISODE_LIST), false otherwise
def self.is_episode_list_with_one_episode_per_line?(page)
  !page.match(@@IS_ONE_LINE_EPISODE_LIST).nil?
end
446
- end
447
- end
448
- end