serienrenamer 0.0.14 → 0.0.15

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,446 +0,0 @@
1
- # encoding: UTF-8
2
- require 'media_wiki'
3
-
4
- module Plugin
5
-
6
- # This Plugin tries to extract the series
7
- # information from wikipedia
8
- #
9
- # (by now only the german wikipedia)
10
- class Wikipedia < Serienrenamer::Pluginbase
11
-
12
def self.plugin_name
  # name reported for this plugin
  "Wikipedia"
end

def self.usable
  # this plugin always reports itself as usable
  true
end

def self.priority
  # priority value reported for this plugin
  30
end
15
-
16
- @@WIKIPEDIA_URL = 'http://de.wikipedia.org/w/api.php'
17
-
18
- # patterns used in this class
19
- @@EPISODE_TABLE_PATTERN = /.*(?<table>\{\{Episodenlistentabelle.*\}\})\s*$/m
20
- @@EPISODE_ENTRY_PATTERN = /\{\{Episodenlisteneintrag|S-Episode/
21
- @@SERIES_SITE_TEST_PATTERN = /\{\{Infobox.Fernsehsendung.*\}\}/m
22
- @@DISAMBIGUATION_TEST_PATTERN = /\{\{Begriffsklärung\}\}/m
23
- @@CONTAINS_LINK_TO_EPISODE_LIST = /Hauptartikel.*(?<main>Liste.*?)[\]\}]+/
24
- @@CONTAINS_INARTICLE_EPISODE_LIST = /\<div.*\>Staffel.(\d+).*\<\/div\>.*class=\"wikitable\".*titel/m
25
- @@INPAGE_SEASON_SEPARATOR = /\<div.style=\"clear:both\;.class=\"NavFrame\"\>/
26
- @@WIKITABLE_EXTRACT_PATTERN = /(\{\|.class=\"wikitable\".*\|\})\n/m
27
- @@IS_ONE_LINE_EPISODE_LIST = /\|.*\|\|.*\|\|.*\|\|/m
28
-
29
-
30
- # this method will be called from the main program
31
- # with an Serienrenamer::Episode instance as parameter
32
- #
33
- # it returns an array of episode information
34
def self.generate_episode_information(episode)
  # Looks up the title of the given episode in the (german) wikipedia.
  #
  # episode - Serienrenamer::Episode; its series, season and episode
  #           attributes are read
  #
  # Returns an array that holds the episode name if one was found,
  # otherwise an empty array.
  # Raises ArgumentError if episode is not a Serienrenamer::Episode.
  raise ArgumentError, "Serienrenamer::Episode instance needed" unless
    episode.is_a?(Serienrenamer::Episode)

  return [] unless episode.series.match(/\w+/)

  # parsed series data is kept between calls, keyed by the series name
  @cached_data ||= {}

  unless @cached_data.has_key?(episode.series)
    wiki = MediaWiki::Gateway.new(@@WIKIPEDIA_URL)

    # search with the full series name first and fall back to its last
    # word; the fallback is skipped when there is no trailing word or
    # when it would only repeat the first search
    search_patterns = [episode.series]
    last_word = episode.series[/(\w+)\s*$/, 1]
    search_patterns << last_word if last_word && last_word != episode.series

    series_site = nil
    pagedata = nil
    tries = 3 # API error budget shared by all searches

    search_patterns.each do |search_pattern|
      begin
        wiki.search(search_pattern, nil, 15).each do |title|
          candidate = wiki.get(title)
          next unless is_series_main_page?(candidate)

          # keep the page text so it does not have to be fetched again
          series_site = title
          pagedata = candidate
          break
        end
      rescue MediaWiki::APIError
        tries -= 1
        retry if tries > 0
      end

      break if series_site || tries <= 0
    end

    return [] unless series_site

    if contains_link_to_episode_list?(pagedata)
      # the series page links to a separate episode list article
      mainarticle = pagedata.match(@@CONTAINS_LINK_TO_EPISODE_LIST)[:main]
      if mainarticle
        episodelist_page = wiki.get(mainarticle)
        @cached_data[episode.series] =
          parse_episodelist_page_data(episodelist_page)
      end

    elsif contains_inarticle_episode_list?(pagedata)
      # the episode list is embedded in the series page itself
      @cached_data[episode.series] =
        parse_inarticle_episodelist_page_data(pagedata)

    else
      warn "no episode list found"
      return []
    end
  end

  episode_names = []

  # best effort lookup: a missing series, season or episode entry (or
  # an index of an unexpected type) simply yields an empty result
  begin
    series = @cached_data[episode.series]
    episodename = series[episode.season][episode.episode]
    episode_names.push(episodename) if episodename.match(/\w+/)
  rescue StandardError
    # deliberately ignored, see above
  end

  episode_names
end
117
-
118
-
119
- # This method will extract season based information
120
- # from a string that contains a wikipedia episodelist page
121
- #
122
- # returns an Array of Arrays with episode information
123
- # where episode and season numbers are the indizes
124
def self.parse_episodelist_page_data(pagedata, debug=false)
  # Collects the season tables of an episode list page.
  #
  # The page is cut at its headings; the text chunk that follows a
  # heading naming a numbered 'Staffel' is handed to parse_season_table
  # and stored under that season number.
  unless pagedata.is_a?(String)
    raise ArgumentError, 'String with pagedata expected'
  end

  seasons = []
  pending_season = nil # season number waiting for its table chunk

  pagedata.split(/(==.*)==/).each do |chunk|
    if chunk.match(/^==.*Staffel/)
      numbered = chunk.match(/^==.*Staffel.(?<seasonnr>\d+)/)
      pending_season = numbered[:seasonnr].to_i if numbered
    elsif pending_season
      seasons[pending_season] = parse_season_table(chunk)
      pending_season = nil
    end
  end

  seasons
end
154
-
155
-
156
- # this method will be called with a wikipedia seasontable
157
- # as parameter and will extract all episodes from this
158
- # and returns that as an array where the episode number is
159
- # the index
160
def self.parse_season_table(table)
  # Extracts the episodes of one season from the wiki markup of an
  # episode list table.
  #
  # table - String that contains the season table
  #
  # Returns an array of episode names indexed by the episode number
  # within the season (empty if no table was found).
  # Raises ArgumentError if table is not a String.
  raise ArgumentError, 'String with seasontable expected' unless
    table.is_a?(String)

  season_data = []

  matched_table = table.match(@@EPISODE_TABLE_PATTERN)
  return season_data unless matched_table

  # every entry looks like the following
  #
  #   {{Episodenlisteneintrag
  #   | NR_GES = 107
  #   | NR_ST = 1
  #   | OT = The Mastodon in the Room
  #   | DT = Die Rueckkehr der Scheuklappen
  #   | ZF =
  #   | EA = {{dts|23|09|2010}}
  #   | EAD = {{dts|08|09|2011}}
  #   }}
  matched_table[:table].split(@@EPISODE_ENTRY_PATTERN).each do |epi|

    # build up a hash from the "key = value" lines of the entry
    infos = {}
    epi.lines.each do |part|
      parts = part.strip.match(/(?<key>\w+).=.(?<value>.*)$/)
      infos[parts[:key].strip] = parts[:value].strip if parts
    end

    # skip entries without a usable episode number; to_i alone would
    # silently turn a non-numeric value into episode 0
    next unless infos.has_key?('NR_ST') && infos['NR_ST'] =~ /\A\d/
    epi_nr = infos['NR_ST'].to_i

    # TODO make the following variable
    # entries without a german title are skipped instead of failing
    epi_name = infos['DT']
    next unless epi_name

    # remove html tags together with all text that follows them and
    # the square brackets of mediawiki links
    epi_name = epi_name.gsub(/<\/?[^>]*>.*/, "").gsub(/[\[\]]/, "").strip
    next unless epi_name.match(/\w+/)

    season_data[epi_nr] = epi_name
  end

  season_data
end
218
-
219
-
220
- # This method will extract season based information
221
- # from a string that contains a series page with an
222
- # episodelist included
223
- #
224
- # returns an Array of Arrays with episode information
225
- # where episode and season numbers are the indizes
226
def self.parse_inarticle_episodelist_page_data(pagedata, debug=false)
  # Extracts season based information from a series page that has the
  # episode list embedded.
  #
  # pagedata - String with the wiki markup of the series page
  #
  # Returns an array (index: season number) of arrays (index: episode
  # number) of episode names.
  # Raises ArgumentError if pagedata is not a String or if no paragraph
  # with an episode list is found.
  raise ArgumentError, 'String with pagedata expected' unless
    pagedata.is_a?(String)

  series_data = []

  # look for the first paragraph with an episodelist
  episodelist_paragraph = pagedata.split(/==.*==/).find { |p|
    contains_inarticle_episode_list?(p) }

  raise ArgumentError, 'no episodelist found' unless episodelist_paragraph

  # iterate through all seasons in this episode table
  episodelist_paragraph.split(@@INPAGE_SEASON_SEPARATOR).each do |season|
    season_match = season.match(@@CONTAINS_INARTICLE_EPISODE_LIST)
    next unless season_match
    season_nr = season_match[1].to_i

    # the extraction pattern is stricter than the test above, so a
    # chunk can pass the test and still offer no extractable table
    table_match = season.match(@@WIKITABLE_EXTRACT_PATTERN)
    next unless table_match
    wikitable = table_match[1]

    # we have to detect the type of the inarticle season page
    # because there are two different kinds of table structures
    # used in the german wikipedia
    episodes =
      if is_episode_list_with_one_episode_per_line?(wikitable)
        parse_inarticle_season_table_with_one_line(wikitable)
      else
        parse_inarticle_season_table(wikitable)
      end

    # HACK if a season is split into different parts
    # eg. Flashpoint (2.1 and 2.2) then merge them if possible;
    # entries of the current part win over earlier ones
    previous = series_data[season_nr]
    if previous
      previous.each_with_index do |item, index|
        episodes[index] ||= item
      end
    end

    series_data[season_nr] = episodes
  end

  series_data
end
268
-
269
-
270
- # this method will be called with a wikitable for a season
271
- # as parameter and will extract all episodes from this
272
- # and returns that as an array where the episode number is
273
- # the index
274
- #
275
- # Example for an wikitable for episodes:
276
- #
277
- # {| class="wikitable" width="100%"
278
- # |- vertical-align: top; text-align:center; "
279
- # | width="15" | '''Nummer''' <br /><small>(Gesamt)<small>
280
- # | width="15" | '''Nummer''' <br /><small>(Staffel)<small>
281
- # ! width="250" | Originaltitel
282
- # ! width="250" | Deutscher Titel
283
- # ! width="180" | Erstausstrahlung<br /><small>(USA Network)</small>
284
- # ! width="180" | Erstausstrahlung<br /><small>(RTL)</small>
285
- # ! width="180" | Erstausstrahlung<br /><small>(SF zwei)</small>
286
- # |-
287
- # | bgcolor="#DFEEEF"| 01
288
- # | 01
289
- # | ''Pilot''
290
- # | ''Auch Reiche sind nur Menschen''
291
- # | 4. Mai 2009
292
- # | 17. Mai 2011
293
- # | 6. Juni 2011 (Teil 1)<br />13. Juni 2011 (Teil 2)
294
- # |-
295
- # |}
296
- #
297
def self.parse_inarticle_season_table(table)
  # Extracts the episodes from a season wikitable in which every cell
  # of a row sits on its own line.
  #
  # table - String with the wikitable markup
  #
  # Returns an array of episode names indexed by the episode number.
  # Raises ArgumentError if table is not a String.
  raise ArgumentError, 'String with seasontable expected' unless
    table.is_a?(String)

  season_data = []
  episode_nr_line_nr = nil
  episode_name_line_nr = nil

  table.split(/^\|\-.*$/).each do |tablerow|
    # skip invalid rows
    lines = tablerow.strip.lines.to_a
    next unless lines.length >= 4

    if tablerow.match(/width=\"\d+\"/)
      # table header: remember on which line of a row the needed
      # cells are located
      lines.each_with_index do |item, index|
        if item.match(/Nummer.*Staffel/i)
          episode_nr_line_nr = index

        # TODO make the following more variable
        elsif item.match(/Deutscher.*Titel/i)
          episode_name_line_nr = index
        end
      end
      next
    end

    # data rows can only be read once the header has been seen
    next unless episode_nr_line_nr && episode_name_line_nr

    # a row may have fewer cells than the header; skip it then
    nr_line = lines[episode_nr_line_nr]
    name_line = lines[episode_name_line_nr]
    next unless nr_line && name_line

    md_nr = nr_line.strip.match(/(\d+)/)
    next unless md_nr

    md_name = name_line.strip.match(/^\|.(.*)$/)
    next unless md_name

    # drop quote characters and square brackets from the title
    episode_name = md_name[1].gsub(/[\'\"\[\]]/, "").strip
    next unless episode_name.match(/\w+/)

    season_data[md_nr[1].to_i] = episode_name
  end

  season_data
end
347
-
348
-
349
- # this method will be called with a wikitable for a season
350
- # as parameter and will extract all episodes from this
351
- # and returns that as an array where the episode number is
352
- # the index
353
- #
354
- # this method implements a special format that takes place in
355
- # e.g. 'Prison Break' where an episode is not spread along several
356
- # lines like in the method above
357
- #
358
- # Example for an wikitable for episodes:
359
- #
360
- #{| class="wikitable"
361
- # |- style="color:#black; background-color:#006699"
362
- # ! '''Episode''' !! '''Deutscher Titel''' !! '''Originaltitel''' !! '''Erstausstrahlung (DE)''' !! '''Erstausstrahlung (USA)'''
363
- # |-
364
- # |'''1''' (1-01) || Der große Plan || Pilot || 21. Juni 2007 || 29. August 2005
365
- # |-
366
- # |'''2''' (1-02) || Lügt Lincoln? || Allen || 21. Juni 2007 || 29. August 2005
367
- # |-
368
- # |'''3''' (1-03) || Vertrauenstest || Cell Test || 28. Juni 2007 || 5. September 2005
369
- # |-
370
- # |'''4''' (1-04) || Veronica steigt ein || Cute Poison || 28. Juni 2007 || 12. September 2005
371
- #
372
def self.parse_inarticle_season_table_with_one_line(table)
  # Extracts the episodes from a season wikitable in which a whole
  # episode is written on one line with '||' between the cells.
  #
  # table - String with the wikitable markup
  #
  # Returns an array of episode names indexed by the episode number.
  # Raises ArgumentError if table is not a String.
  raise ArgumentError, 'String with seasontable expected' unless
    table.is_a?(String)

  season_data = []
  episode_nr_col = nil
  episode_name_col = nil

  table.split(/^\|\-.*$/).each do |tablerow|

    if tablerow.match(/!!.*!!.*!!/)
      # extract column numbers from table header
      tablerow.split(/!!/).each_with_index do |col, index|
        episode_nr_col = index if col.match(/Episode/i)
        episode_name_col = index if col.match(/Deutsch.*Titel/i)
      end

    elsif tablerow.match(/\|\|.*\w+.*\|\|/)
      # data rows can only be read once the header has been seen
      next unless episode_nr_col && episode_name_col

      columns = tablerow.strip.split(/\|\|/)
      nr_column = columns[episode_nr_col]
      name_column = columns[episode_name_col]
      next unless nr_column && name_column

      # the following cleans up the column so that
      # " '''7''' (1-07) " becomes "7 1 07"
      #
      # the last bunch of digits is the episode number, which makes
      # this somewhat independent of the exact format; rows without
      # any digit are skipped
      episode_nr = nr_column.gsub(/\D/, " ").strip[/(\d+)$/, 1]
      next unless episode_nr

      # skip rows that have no usable episode name
      episode_name = name_column.strip
      next unless episode_name.match(/\w+/)

      season_data[episode_nr.to_i] = episode_name
    end
  end

  season_data
end
411
-
412
-
413
- # this method checks if the page is the main page
414
- # for a series
415
- #
416
- # returns true if page contains the infobox that
417
- # is typical for series pages in wikipedia
418
def self.is_series_main_page?(page)
  # true if the page text matches the series infobox pattern
  !page.match(@@SERIES_SITE_TEST_PATTERN).nil?
end
421
-
422
- # check the site if it is a disambiguation site
423
- #
424
- # returns true if this site links to pages with
425
- # themes with the same name
426
def self.is_disambiguation_site?(page)
  # true if the page text matches the disambiguation template pattern
  !page.match(@@DISAMBIGUATION_TEST_PATTERN).nil?
end
429
-
430
- # test if the page contains a link to an article
431
- # with an episode list
432
def self.contains_link_to_episode_list?(page)
  # true if the page text matches the pattern for a link to a
  # separate episode list article
  !page.match(@@CONTAINS_LINK_TO_EPISODE_LIST).nil?
end
435
-
436
- # test if the page contains a episode list
437
def self.contains_inarticle_episode_list?(page)
  # true if the page text matches the pattern for an embedded
  # episode list
  !page.match(@@CONTAINS_INARTICLE_EPISODE_LIST).nil?
end
440
-
441
- # tests for the type of in article episode list
442
def self.is_episode_list_with_one_episode_per_line?(page)
  # true if the text matches the pattern for rows that hold several
  # '||' separated cells
  !page.match(@@IS_ONE_LINE_EPISODE_LIST).nil?
end
445
- end
446
- end
data/lib/plugin.rb DELETED
@@ -1,8 +0,0 @@
1
# put the directory of this file on the load path unless it is already
# listed there, either as is or in its expanded form
plugin_root = File.dirname(__FILE__)
already_listed = $LOAD_PATH.include?(plugin_root) ||
  $LOAD_PATH.include?(File.expand_path(plugin_root))
$LOAD_PATH.unshift(plugin_root) unless already_listed


module Plugin

  # require every ruby file found in the plugin subdirectory
  Dir.glob("#{File.dirname(__FILE__)}/plugin/*.rb").each do |file|
    require file
  end
end