serienrenamer 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,181 +0,0 @@
- #
- # Class that extracts information about episodes
- # from the serienjunkies.org page
- #
- require 'uri'
- require 'mechanize'
- require 'yaml'
-
- module Serienrenamer
-   module Plugin
-
-     class SerienjunkiesOrg < Serienrenamer::Pluginbase
-
-       def self.plugin_name; "SerienjunkiesOrg" end
-       def self.plugin_url; "http://serienjunkies.org" end
-       def self.usable; true end
-       def self.priority; 60 end
-
-       # Public: tries to search for an appropriate episode name
-       #
-       # on the first call, this method builds up a hash of all
-       # series and existing episodes, which is reused by all
-       # future calls
-       #
-       # episode - Serienrenamer::Episode instance which holds the information
-       #
-       # Returns an array of possible episode names
-       def self.generate_episode_information(episode)
-
-         raise ArgumentError, "Serienrenamer::Episode instance needed" unless
-           episode.is_a?(Serienrenamer::Episode)
-
-         unless defined? @cached_data
-           @cached_data = Hash.new
-         end
-
-         if ! @cached_data.has_key?(episode.series)
-
-           if episode.series.match(/\w+/)
-
-             # determine the link to the series page
-             seriespage_link = self.find_link_to_series_page(episode.series)
-
-             if seriespage_link
-               seriesdata = self.parse_seriespage(seriespage_link)
-
-               @cached_data[episode.series] = seriesdata
-             end
-           end
-         end
-
-         matched_episodes = []
-
-         # tries to find an episode name in cached_data,
-         # otherwise returns an empty array
-         begin
-           series = @cached_data[episode.series]
-
-           identifier = "%d_%d" % [ episode.season, episode.episode ]
-           episodename = series[identifier]
-
-           if episodename.match(/\w+/)
-             matched_episodes.push(episodename)
-           end
-         rescue
-         end
-
-         return matched_episodes
-       end
-
-       # Public: tries to find a link to the series page
-       #
-       # seriesname - the series name for which the page is searched
-       #
-       # Returns the link or nil
-       def self.find_link_to_series_page(seriesname)
-         raise ArgumentError, "seriesname expected" unless seriesname.match(/\w+/)
-
-         self.build_agent unless defined? @agent
-
-         url = URI.join(plugin_url, "?cat=0&l=%s" % seriesname[0].downcase)
-
-         # match the series name loosely, allowing anything between words
-         pattern = seriesname.gsub(/\s/, ".*")
-
-         @agent.get(url).search("div#sidebar > ul > li > a").each do |series|
-           if series.text.match(/#{pattern}/i)
-             return URI.join(plugin_url, series[:href]).to_s
-           end
-         end
-
-         nil
-       end
-
-       # Public: parses a series page and extracts the episode information
-       #
-       # page_url - the url to the series page
-       # german   - if true it extracts only German data (defaults to true)
-       #
-       # Returns a hash which contains the episode information, or an
-       # empty hash if there aren't any episodes
-       def self.parse_seriespage(page_url, german=true, debug=false)
-
-         self.build_agent unless defined? @agent
-
-         series = {}
-         doc = @agent.get(page_url)
-
-         doc.search('div#sidebar > div#scb > div.bkname > a').each do |link|
-           if german
-             next unless link.content.match(/Staffel/i)
-           else
-             next unless link.content.match(/Season/i)
-           end
-
-           site = @agent.get(link[:href])
-           episodes = self.parse_season_subpage(site, german)
-
-           series.merge!(episodes)
-         end
-
-         puts series.to_yaml if debug
-
-         return series
-       end
-
-       # Public: extracts the episodes from one season
-       #
-       # page   - Mechanize page object which holds the season
-       # german - extracts German or international episodes
-       #
-       # Returns a hash with all episodes (unique)
-       def self.parse_season_subpage(page, german=true)
-
-         episodes = {}
-
-         page.search('div.post > div.post-content strong:nth-child(1)').each do |e|
-
-           content = e.content
-           md = Serienrenamer::Episode.extract_episode_information(content)
-           next unless md
-
-           if german
-             next unless content.match(/German/i)
-             next if content.match(/Subbed/i)
-           else
-             next if content.match(/German/i)
-           end
-
-           episodename =
-             Serienrenamer::Episode.clean_episode_data(md[:episodename], true)
-           next unless episodename && episodename.match(/\w+/)
-
-           id = "%d_%d" % [ md[:season].to_i, md[:episode].to_i ]
-
-           # keep the longest episode name seen for this id
-           next if episodes[id] && episodes[id].size > episodename.size
-
-           episodes[id] = episodename
-
-         end
-
-         return episodes
-       end
-
-       private
-
-       # Private: constructs a Mechanize instance and adds a fix that
-       # interprets every response as html
-       #
-       # Returns the agent
-       def self.build_agent
-         @agent = Mechanize.new do |a|
-           a.post_connect_hooks << lambda do |_, _, response, _|
-             if response.content_type.nil? || response.content_type.empty?
-               response.content_type = 'text/html'
-             end
-           end
-         end
-       end
-     end
-   end
- end
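
For orientation: the file removed above implemented the serienjunkies.org scraper plugin, whose only entry point is `generate_episode_information`. A minimal usage sketch, assuming an `Episode` object that exposes the `series`, `season`, and `episode` accessors the method relies on (the exact `Episode` constructor is not part of this diff and is shown here hypothetically):

    require 'serienrenamer'

    # hypothetical constructor call; only the series/season/episode
    # accessors used by the plugin matter below
    episode = Serienrenamer::Episode.new('Chuck.S01E01.Title.German.DL.mkv')

    names = Serienrenamer::Plugin::SerienjunkiesOrg
              .generate_episode_information(episode)
    puts names.first unless names.empty?

Scraped series data is memoized in `@cached_data`, keyed by `episode.series`, so later calls for the same series avoid further network requests.
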
@@ -1,448 +0,0 @@
- # encoding: UTF-8
- require 'media_wiki'
-
- module Serienrenamer
-   module Plugin
-
-     # This plugin tries to extract the series
-     # information from Wikipedia
-     #
-     # (currently only the German Wikipedia)
-     class Wikipedia < Serienrenamer::Pluginbase
-
-       def self.plugin_name; "Wikipedia" end
-       def self.usable; true end
-       def self.priority; 30 end
-
-       @@WIKIPEDIA_URL = 'http://de.wikipedia.org/w/api.php'
-
-       # patterns used in this class
-       @@EPISODE_TABLE_PATTERN = /.*(?<table>\{\{Episodenlistentabelle.*\}\})\s*$/m
-       @@EPISODE_ENTRY_PATTERN = /\{\{Episodenlisteneintrag|S-Episode/
-       @@SERIES_SITE_TEST_PATTERN = /\{\{Infobox.Fernsehsendung.*\}\}/m
-       @@DISAMBIGUATION_TEST_PATTERN = /\{\{Begriffsklärung\}\}/m
-       @@CONTAINS_LINK_TO_EPISODE_LIST = /Hauptartikel.*(?<main>Liste.*?)[\]\}]+/
-       @@CONTAINS_INARTICLE_EPISODE_LIST = /\<div.*\>Staffel.(\d+).*\<\/div\>.*class=\"wikitable\".*titel/m
-       @@INPAGE_SEASON_SEPARATOR = /\<div.style=\"clear:both\;.class=\"NavFrame\"\>/
-       @@WIKITABLE_EXTRACT_PATTERN = /(\{\|.class=\"wikitable\".*\|\})\n/m
-       @@IS_ONE_LINE_EPISODE_LIST = /\|.*\|\|.*\|\|.*\|\|/m
-
-
-       # this method will be called from the main program
-       # with a Serienrenamer::Episode instance as parameter
-       #
-       # it returns an array of episode information
-       def self.generate_episode_information(episode)
-
-         raise ArgumentError, "Serienrenamer::Episode instance needed" unless
-           episode.is_a?(Serienrenamer::Episode)
-
-         return [] unless episode.series.match(/\w+/)
-
-         unless defined? @cached_data
-           @cached_data = Hash.new
-         end
-
-         wiki = MediaWiki::Gateway.new(@@WIKIPEDIA_URL)
-
-         if ! @cached_data.has_key?(episode.series)
-           # search for a series page in wikipedia
-           series_site = nil
-           tries = 3
-           search_pattern = episode.series
-           search_pattern_modified = false
-
-           begin
-             wiki.search(search_pattern, nil, 15).each do |title|
-               pagedata = wiki.get(title)
-               if is_series_main_page?(pagedata)
-                 series_site = title
-                 break
-               end
-             end
-
-             # if nothing was found and the search pattern contains
-             # more than one word, reduce it to its last word and
-             # retry once
-             if series_site.nil? && ! search_pattern_modified
-               search_pattern = search_pattern.match(/(\w+)\s*$/)[1]
-               search_pattern_modified = true
-               raise EOFError if search_pattern # break out and retry
-             end
-           rescue MediaWiki::APIError => e
-             tries -= 1
-             retry if tries > 0
-           rescue EOFError => e
-             retry
-           end
-
-           return [] unless series_site
-
-           # look for a link to a list of episodes
-           pagedata = wiki.get(series_site)
-
-           if contains_link_to_episode_list?(pagedata)
-             mainarticle = pagedata.match(@@CONTAINS_LINK_TO_EPISODE_LIST)[:main]
-             if mainarticle
-               episodelist_page = wiki.get(mainarticle)
-               series = parse_episodelist_page_data(episodelist_page)
-
-               @cached_data[episode.series] = series
-             end
-
-           elsif contains_inarticle_episode_list?(pagedata)
-             series = parse_inarticle_episodelist_page_data(pagedata)
-             @cached_data[episode.series] = series
-
-           else
-             warn "no episode list found"
-             return []
-           end
-         end
-
-         episode_names = []
-
-         # tries to find an episode name in cached_data,
-         # otherwise returns an empty array
-         begin
-           series = @cached_data[episode.series]
-           episodename = series[episode.season][episode.episode]
-           if episodename.match(/\w+/)
-             episode_names.push(episodename)
-           end
-         rescue
-         end
-
-         return episode_names
-       end
-
-
-       # This method will extract season-based information
-       # from a string that contains a wikipedia episode list page
-       #
-       # returns an Array of Arrays with episode information,
-       # where season and episode numbers are the indices
-       def self.parse_episodelist_page_data(pagedata, debug=false)
-         raise ArgumentError, 'String with pagedata expected' unless
-           pagedata.is_a?(String)
-
-         series_data = []
-         is_season_table_following = false
-         season_number = nil
-
-         # split the wikipedia page by headings and process
-         # the following paragraph if the heading starts with
-         # 'Staffel'
-         pagedata.split(/(==.*)==/).each do |paragraph|
-           if paragraph.match(/^==.*Staffel/)
-             match = paragraph.match(/^==.*Staffel.(?<seasonnr>\d+)/)
-             if match
-               season_number = match[:seasonnr].to_i
-               is_season_table_following = true
-             end
-           elsif is_season_table_following
-             # extract the season table from this paragraph
-             season = parse_season_table(paragraph)
-
-             series_data[season_number] = season
-             is_season_table_following = false
-           end
-         end
-
-         return series_data
-       end
-
-
-       # this method will be called with a wikipedia season table
-       # as parameter; it extracts all episodes from it and
-       # returns them as an array where the episode number is
-       # the index
-       def self.parse_season_table(table)
-         raise ArgumentError, 'String with seasontable expected' unless
-           table.is_a?(String)
-
-         season_data = []
-
-         matched_table = table.match(@@EPISODE_TABLE_PATTERN)
-         if matched_table
-
-           # extract all episode entries that
-           # look like the following
-           #
-           # {{Episodenlisteneintrag
-           # | NR_GES = 107
-           # | NR_ST = 1
-           # | OT = The Mastodon in the Room
-           # | DT = Die Rückkehr der Scheuklappen
-           # | ZF =
-           # | EA = {{dts|23|09|2010}}
-           # | EAD = {{dts|08|09|2011}}
-           # }}
-
-           episodes = matched_table[:table].split(@@EPISODE_ENTRY_PATTERN)
-           if episodes
-             episodes.each do |epi|
-
-               # build up a hash from the entry
-               infos = {}
-               epi.lines.each do |part|
-                 parts = part.strip.match(/(?<key>\w+).=.(?<value>.*)$/)
-                 if parts
-                   infos[parts[:key].strip] = parts[:value].strip
-                 end
-               end
-
-               next unless infos.has_key?('NR_ST')
-
-               # extract useful information and
-               # add it to the array
-               epi_nr = infos['NR_ST'].to_i
-               next unless epi_nr
-
-               # TODO make the following variable
-               epi_name = infos['DT'].strip
-
-               # remove all html tags and all following
-               # text from the episode name, and the bold
-               # syntax from mediawiki [[text]]
-               epi_name.gsub!(/<\/?[^>]*>.*/, "")
-               epi_name.gsub!(/[\[\[\]\]]/, "")
-               next unless epi_name.match(/\w+/)
-
-               season_data[epi_nr] = epi_name
-             end
-           end
-         end
-         return season_data
-       end
-
-
-       # This method will extract season-based information
-       # from a string that contains a series page with an
-       # episode list included
-       #
-       # returns an Array of Arrays with episode information,
-       # where season and episode numbers are the indices
-       def self.parse_inarticle_episodelist_page_data(pagedata, debug=false)
-         raise ArgumentError, 'String with pagedata expected' unless
-           pagedata.is_a?(String)
-
-         series_data = []
-
-         # look for a paragraph with an episode list
-         episodelist_paragraph = pagedata.split(/==.*==/).select { |p|
-           contains_inarticle_episode_list?(p) }[0]
-
-         raise ArgumentError, 'no episodelist found' unless episodelist_paragraph
-
-         # iterate through all seasons in this episode table
-         episodelist_paragraph.split(@@INPAGE_SEASON_SEPARATOR).each do |season|
-           next unless contains_inarticle_episode_list?(season)
-
-           season_nr = season.match(@@CONTAINS_INARTICLE_EPISODE_LIST)[1].to_i
-
-           wikitable = season.match(@@WIKITABLE_EXTRACT_PATTERN)[1]
-
-           # we have to detect the type of the in-article season page
-           # because there are two different kinds of table structures
-           # used in the German wikipedia
-           if self.is_episode_list_with_one_episode_per_line?(wikitable)
-             episodes = parse_inarticle_season_table_with_one_line(wikitable)
-           else
-             episodes = parse_inarticle_season_table(wikitable)
-           end
-
-           # HACK: if a season is split into different parts
-           # (e.g. Flashpoint 2.1 and 2.2), then merge them if possible
-           if series_data[season_nr] != nil
-             series_data[season_nr].each_with_index do |item, index|
-               episodes[index] = item unless episodes[index]
-             end
-           end
-
-           series_data[season_nr] = episodes
-         end
-
-         return series_data
-       end
-
-
-       # this method will be called with a wikitable for a season
-       # as parameter; it extracts all episodes from it and
-       # returns them as an array where the episode number is
-       # the index
-       #
-       # Example of a wikitable for episodes:
-       #
-       # {| class="wikitable" width="100%"
-       # |- vertical-align: top; text-align:center; "
-       # | width="15" | '''Nummer''' <br /><small>(Gesamt)<small>
-       # | width="15" | '''Nummer''' <br /><small>(Staffel)<small>
-       # ! width="250" | Originaltitel
-       # ! width="250" | Deutscher Titel
-       # ! width="180" | Erstausstrahlung<br /><small>(USA Network)</small>
-       # ! width="180" | Erstausstrahlung<br /><small>(RTL)</small>
-       # ! width="180" | Erstausstrahlung<br /><small>(SF zwei)</small>
-       # |-
-       # | bgcolor="#DFEEEF"| 01
-       # | 01
-       # | ''Pilot''
-       # | ''Auch Reiche sind nur Menschen''
-       # | 4. Mai 2009
-       # | 17. Mai 2011
-       # | 6. Juni 2011 (Teil 1)<br />13. Juni 2011 (Teil 2)
-       # |-
-       # |}
-       #
-       def self.parse_inarticle_season_table(table)
-         raise ArgumentError, 'String with seasontable expected' unless
-           table.is_a?(String)
-
-         season_data = []
-         episode_nr_line_nr = nil
-         episode_name_line_nr = nil
-
-         table.split(/^\|\-.*$/).each do |tablerow|
-           tablerow.strip!
-
-           # skip invalid rows
-           lines = tablerow.lines.to_a
-           next unless lines.length >= 4
-
-           if tablerow.match(/width=\"\d+\"/)
-             # extract line numbers for the needed data
-             # from the table header
-             lines.each_with_index do |item, index|
-               if item.match(/Nummer.*Staffel/i)
-                 episode_nr_line_nr = index
-
-               # TODO make the following more variable
-               elsif item.match(/Deutscher.*Titel/i)
-                 episode_name_line_nr = index
-               end
-             end
-           else
-             # extract episode information
-             if episode_nr_line_nr && episode_name_line_nr
-
-               md_nr = lines[episode_nr_line_nr].strip.match(/(\d+)/)
-               if md_nr
-                 episode_nr = md_nr[1].to_i
-
-                 md_name = lines[episode_name_line_nr].strip.match(/^\|.(.*)$/)
-                 if md_name
-                   episode_name = md_name[1]
-                   episode_name.gsub!(/[\'\"\[\]]/, "")
-                   next unless episode_name.match(/\w+/)
-
-                   season_data[episode_nr] = episode_name.strip
-                 end
-               end
-             end
-           end
-         end
-
-         return season_data
-       end
-
-
-       # this method will be called with a wikitable for a season
-       # as parameter; it extracts all episodes from it and
-       # returns them as an array where the episode number is
-       # the index
-       #
-       # this method handles a special format used in
-       # e.g. 'Prison Break' where an episode is not spread across
-       # several lines like in the method above
-       #
-       # Example of a wikitable for episodes:
-       #
-       # {| class="wikitable"
-       # |- style="color:#black; background-color:#006699"
-       # ! '''Episode''' !! '''Deutscher Titel''' !! '''Originaltitel''' !! '''Erstausstrahlung (DE)''' !! '''Erstausstrahlung (USA)'''
-       # |-
-       # |'''1''' (1-01) || Der große Plan || Pilot || 21. Juni 2007 || 29. August 2005
-       # |-
-       # |'''2''' (1-02) || Lügt Lincoln? || Allen || 21. Juni 2007 || 29. August 2005
-       # |-
-       # |'''3''' (1-03) || Vertrauenstest || Cell Test || 28. Juni 2007 || 5. September 2005
-       # |-
-       # |'''4''' (1-04) || Veronica steigt ein || Cute Poison || 28. Juni 2007 || 12. September 2005
-       #
-       def self.parse_inarticle_season_table_with_one_line(table)
-         raise ArgumentError, 'String with seasontable expected' unless
-           table.is_a?(String)
-
-         season_data = []
-         episode_nr_col = nil
-         episode_name_col = nil
-
-         table.split(/^\|\-.*$/).each do |tablerow|
-
-           if tablerow.match(/!!.*!!.*!!/)
-             # extract column numbers from the table header
-             tablerow.split(/!!/).each_with_index do |col, index|
-               episode_nr_col = index if col.match(/Episode/i)
-               episode_name_col = index if col.match(/Deutsch.*Titel/i)
-             end
-
-           elsif tablerow.match(/\|\|.*\w+.*\|\|/)
-             tablerow.strip!
-             columns = tablerow.split(/\|\|/)
-
-             # the following cleans up the column so that, for example,
-             # " '''7''' (1-07) " becomes "7 1 07"
-             #
-             # we can then extract the last run of digits, which makes
-             # this algorithm somewhat format-independent
-             dirty_episode_nr = columns[episode_nr_col].gsub(/\D/, " ").strip
-             episode_nr = dirty_episode_nr.match(/(\d+)$/)[1]
-             next unless episode_nr
-
-             episode_name = columns[episode_name_col].strip
-             next unless episode_name.match(/\w+/)
-
-             season_data[episode_nr.to_i] = episode_name
-           end
-         end
-
-         return season_data
-       end
-
-
-       # this method checks if the page is the main page
-       # for a series
-       #
-       # returns true if the page contains the infobox that
-       # is typical for series pages in wikipedia
-       def self.is_series_main_page?(page)
-         page.match(@@SERIES_SITE_TEST_PATTERN) != nil
-       end
-
-       # checks if the site is a disambiguation page
-       #
-       # returns true if this site links to pages about
-       # different topics with the same name
-       def self.is_disambiguation_site?(page)
-         page.match(@@DISAMBIGUATION_TEST_PATTERN) != nil
-       end
-
-       # tests if the page contains a link to an article
-       # with an episode list
-       def self.contains_link_to_episode_list?(page)
-         page.match(@@CONTAINS_LINK_TO_EPISODE_LIST) != nil
-       end
-
-       # tests if the page contains an episode list
-       def self.contains_inarticle_episode_list?(page)
-         page.match(@@CONTAINS_INARTICLE_EPISODE_LIST) != nil
-       end
-
-       # tests for the type of in-article episode list
-       def self.is_episode_list_with_one_episode_per_line?(page)
-         page.match(@@IS_ONE_LINE_EPISODE_LIST) != nil
-       end
-     end
-   end
- end
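
The heart of the removed Wikipedia plugin is the key/value extraction from `{{Episodenlisteneintrag}}` templates inside `parse_season_table`. A self-contained sketch of just that step, using the same regular expression as the code above (the sample wikitext is invented for illustration):

    entry = <<~WIKITEXT
      | NR_GES = 107
      | NR_ST = 1
      | OT = The Mastodon in the Room
      | DT = Die Rückkehr der Scheuklappen
    WIKITEXT

    # build up a hash from the entry, as the plugin does
    infos = {}
    entry.lines.each do |part|
      parts = part.strip.match(/(?<key>\w+).=.(?<value>.*)$/)
      infos[parts[:key].strip] = parts[:value].strip if parts
    end

    infos['NR_ST']  # => "1"
    infos['DT']     # => "Die Rückkehr der Scheuklappen"

The `.` on either side of the `=` matches the surrounding space, so values still get a final `strip`; the plugin then converts `NR_ST` with `to_i` and uses it as the array index for the season, with the German title from `DT` as the value.
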