serienrenamer 0.0.14 → 0.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,446 +0,0 @@
1
- # encoding: UTF-8
2
- require 'media_wiki'
3
-
4
- module Plugin
5
-
6
- # This Plugin tries to extract the series
7
- # information from wikipedia
8
- #
9
- # (for now only the German Wikipedia)
10
- class Wikipedia < Serienrenamer::Pluginbase
11
-
12
# Name of this plugin.
def self.plugin_name
  "Wikipedia"
end

# Whether this plugin may be used; always true for this plugin.
def self.usable
  true
end

# Numeric priority of this plugin.
# NOTE(review): presumably used by the main program to order plugins - confirm.
def self.priority
  30
end
15
-
16
# MediaWiki API endpoint of the German-language Wikipedia.
# HTTPS is used because Wikimedia redirects plain HTTP requests, which
# breaks API calls that are sent as POST requests.
@@WIKIPEDIA_URL = 'https://de.wikipedia.org/w/api.php'

# patterns used in this class

# captures (as "table") an {{Episodenlistentabelle ... }} template block
@@EPISODE_TABLE_PATTERN = /.*(?<table>\{\{Episodenlistentabelle.*\}\})\s*$/m
# separator used to split an episode table into its single entries
@@EPISODE_ENTRY_PATTERN = /\{\{Episodenlisteneintrag|S-Episode/
# matches the "Infobox Fernsehsendung" template of a series main page
@@SERIES_SITE_TEST_PATTERN = /\{\{Infobox.Fernsehsendung.*\}\}/m
# matches the disambiguation template
@@DISAMBIGUATION_TEST_PATTERN = /\{\{Begriffsklärung\}\}/m
# captures (as "main") the title of a linked "Liste ..." main article
@@CONTAINS_LINK_TO_EPISODE_LIST = /Hauptartikel.*(?<main>Liste.*?)[\]\}]+/
# matches a "Staffel <n>" div followed by a wikitable that mentions "titel";
# capture group 1 is the season number
@@CONTAINS_INARTICLE_EPISODE_LIST = /\<div.*\>Staffel.(\d+).*\<\/div\>.*class=\"wikitable\".*titel/m
# opening tag of the NavFrame div that separates seasons inside one page
@@INPAGE_SEASON_SEPARATOR = /\<div.style=\"clear:both\;.class=\"NavFrame\"\>/
# captures a complete {| class="wikitable" ... |} block that ends a line
@@WIKITABLE_EXTRACT_PATTERN = /(\{\|.class=\"wikitable\".*\|\})\n/m
# matches table text that contains at least three "||" column separators
@@IS_ONE_LINE_EPISODE_LIST = /\|.*\|\|.*\|\|.*\|\|/m
28
-
29
-
30
# Looks up the episode name for the given episode on Wikipedia.
#
# This method is called from the main program with a
# Serienrenamer::Episode instance as parameter. The parsed episode
# data of a series is cached in @cached_data (keyed by the series
# name), so Wikipedia is only queried when a series is not cached yet.
#
# episode - Serienrenamer::Episode; its series, season and episode
#           attributes are read.
#
# Returns an Array that contains the episode name, or an empty Array
# if no series page, no episode list or no matching entry was found.
# Raises ArgumentError if episode is not a Serienrenamer::Episode.
def self.generate_episode_information(episode)
  raise ArgumentError, "Serienrenamer::Episode instance needed" unless
    episode.is_a?(Serienrenamer::Episode)

  return [] unless episode.series.match(/\w+/)

  @cached_data ||= {}

  unless @cached_data.has_key?(episode.series)
    series = fetch_series_data(episode.series)
    return [] unless series
    @cached_data[episode.series] = series
  end

  # Explicit checks instead of a blanket rescue: only integer indexes
  # can address the season/episode arrays, anything else yields no name.
  series = @cached_data[episode.series]
  return [] unless episode.season.is_a?(Integer) && episode.episode.is_a?(Integer)

  season = series[episode.season]
  episodename = season.is_a?(Array) ? season[episode.episode] : nil

  if episodename.is_a?(String) && episodename.match(/\w+/)
    [episodename]
  else
    []
  end
end

# Internal helper: queries Wikipedia for the series and parses its
# episode list.
#
# The series main page is searched with the full series name first and,
# if that fails, once more with only the last word of the name.
#
# Returns the parsed series data (Array of season Arrays) or nil if no
# series page or no episode list could be found.
def self.fetch_series_data(series_name)
  wiki = MediaWiki::Gateway.new(@@WIKIPEDIA_URL)

  pagedata = find_series_main_page(wiki, series_name)
  if pagedata.nil?
    last_word = series_name[/(\w+)\s*$/, 1]
    # searching the identical term a second time cannot help
    if last_word && last_word != series_name
      pagedata = find_series_main_page(wiki, last_word)
    end
  end
  return nil unless pagedata

  if contains_link_to_episode_list?(pagedata)
    mainarticle = pagedata.match(@@CONTAINS_LINK_TO_EPISODE_LIST)[:main]
    episodelist_page = wiki.get(mainarticle)
    unless episodelist_page
      # the linked list article does not exist
      warn "no episode list found"
      return nil
    end
    parse_episodelist_page_data(episodelist_page)

  elsif contains_inarticle_episode_list?(pagedata)
    parse_inarticle_episodelist_page_data(pagedata)

  else
    warn "no episode list found"
    nil
  end
end

# Internal helper: searches Wikipedia for search_term and returns the
# wikitext of the first hit that is a series main page, or nil.
#
# API and EOF errors are retried, but at most three attempts are made
# so that a persistent failure cannot loop forever.
def self.find_series_main_page(wiki, search_term)
  tries = 3
  begin
    wiki.search(search_term, nil, 15).each do |title|
      pagedata = wiki.get(title)
      # wiki.get may return nil for a page that does not exist
      return pagedata if pagedata && is_series_main_page?(pagedata)
    end
    nil
  rescue MediaWiki::APIError, EOFError
    tries -= 1
    retry if tries > 0
    nil
  end
end

private_class_method :fetch_series_data, :find_series_main_page
117
-
118
-
119
# Extracts the season tables from the wikitext of a Wikipedia
# episode list page.
#
# The page is split at its headings; the text that follows a heading
# containing 'Staffel <number>' is handed to parse_season_table and
# stored under that season number.
#
# pagedata - String with the wikitext of the page
# debug    - unused
#
# Returns an Array of Arrays with episode names, where the season
# number and the episode number are the indexes.
# Raises ArgumentError if pagedata is not a String.
def self.parse_episodelist_page_data(pagedata, debug=false)
  unless pagedata.is_a?(String)
    raise ArgumentError, 'String with pagedata expected'
  end

  seasons = []
  # season number whose table is expected in the next chunk, if any
  pending_season = nil

  pagedata.split(/(==.*)==/).each do |chunk|
    if chunk.match(/^==.*Staffel/)
      numbered = chunk.match(/^==.*Staffel.(?<seasonnr>\d+)/)
      pending_season = numbered[:seasonnr].to_i if numbered
    elsif pending_season
      seasons[pending_season] = parse_season_table(chunk)
      pending_season = nil
    end
  end

  seasons
end
154
-
155
-
156
# Extracts all episodes from the wikitext of one season table.
#
# The table is expected to consist of entries like the following:
#
#   {{Episodenlisteneintrag
#   | NR_GES = 107
#   | NR_ST = 1
#   | OT = The Mastodon in the Room
#   | DT = Die Rückkehr der Scheuklappen
#   | ZF =
#   | EA = {{dts|23|09|2010}}
#   | EAD = {{dts|08|09|2011}}
#   }}
#
# table - String with the wikitext that contains the season table
#
# Returns an Array of episode names (taken from the DT field) where the
# episode number within the season (NR_ST) is the index. Entries without
# a numeric NR_ST or without a usable DT value are skipped.
# Raises ArgumentError if table is not a String.
def self.parse_season_table(table)
  raise ArgumentError, 'String with seasontable expected' unless
    table.is_a?(String)

  season_data = []

  matched_table = table.match(@@EPISODE_TABLE_PATTERN)
  return season_data unless matched_table

  matched_table[:table].split(@@EPISODE_ENTRY_PATTERN).each do |epi|

    # build up a hash from the "| KEY = value" lines of the entry
    infos = {}
    epi.lines.each do |part|
      parts = part.strip.match(/(?<key>\w+).=.(?<value>.*)$/)
      infos[parts[:key].strip] = parts[:value].strip if parts
    end

    # String#to_i never returns nil, so test for digits explicitly;
    # otherwise an empty NR_ST would be stored at index 0
    raw_nr = infos['NR_ST']
    next unless raw_nr && raw_nr.match(/\A\d+/)
    epi_nr = raw_nr.to_i

    # TODO make the following variable
    raw_name = infos['DT']
    next unless raw_name

    # remove the first html tag together with all following text and
    # the brackets of the mediawiki link syntax [[text]]
    epi_name = raw_name.gsub(/<\/?[^>]*>.*/, "").gsub(/[\[\[\]\]]/, "").strip
    next unless epi_name.match(/\w+/)

    season_data[epi_nr] = epi_name
  end

  season_data
end
218
-
219
-
220
# Extracts the season tables from the wikitext of a series page that
# contains its episode list inside the article.
#
# pagedata - String with the wikitext of the series page
# debug    - unused
#
# Returns an Array of Arrays with episode names, where the season
# number and the episode number are the indexes.
# Raises ArgumentError if pagedata is not a String or if no paragraph
# with an episode list is found.
def self.parse_inarticle_episodelist_page_data(pagedata, debug=false)
  raise ArgumentError, 'String with pagedata expected' unless
    pagedata.is_a?(String)

  series_data = []

  # look for the first paragraph with an episodelist
  episodelist_paragraph = pagedata.split(/==.*==/).find do |paragraph|
    contains_inarticle_episode_list?(paragraph)
  end

  raise ArgumentError, 'no episodelist found' unless episodelist_paragraph

  # iterate through all seasons in this episode table
  episodelist_paragraph.split(@@INPAGE_SEASON_SEPARATOR).each do |season|
    season_match = season.match(@@CONTAINS_INARTICLE_EPISODE_LIST)
    next unless season_match
    season_nr = season_match[1].to_i

    # skip seasons whose table cannot be extracted instead of failing
    # with a NoMethodError on nil
    table_match = season.match(@@WIKITABLE_EXTRACT_PATTERN)
    next unless table_match
    wikitable = table_match[1]

    # we have to detect the type of the inarticle season page
    # because there are two different kinds of table structures
    # used in the german wikipedia
    episodes =
      if is_episode_list_with_one_episode_per_line?(wikitable)
        parse_inarticle_season_table_with_one_line(wikitable)
      else
        parse_inarticle_season_table(wikitable)
      end

    # HACK if a season is split into different parts
    # (e.g. Flashpoint 2.1 and 2.2) then merge them: entries that were
    # parsed earlier fill the gaps of the current part
    earlier_part = series_data[season_nr]
    if earlier_part
      earlier_part.each_with_index do |item, index|
        episodes[index] ||= item
      end
    end

    series_data[season_nr] = episodes
  end

  series_data
end
268
-
269
-
270
# Extracts all episodes from a wikitable in which every table cell
# is written on its own line.
#
# Example of such a wikitable:
#
#   {| class="wikitable" width="100%"
#   |- vertical-align: top; text-align:center; "
#   | width="15" | '''Nummer''' <br /><small>(Gesamt)<small>
#   | width="15" | '''Nummer''' <br /><small>(Staffel)<small>
#   ! width="250" | Originaltitel
#   ! width="250" | Deutscher Titel
#   ! width="180" | Erstausstrahlung<br /><small>(RTL)</small>
#   |-
#   | bgcolor="#DFEEEF"| 01
#   | 01
#   | ''Pilot''
#   | ''Auch Reiche sind nur Menschen''
#   | 17. Mai 2011
#   |-
#   |}
#
# The header row (recognised by its width="..." attributes) tells on
# which line of a row the season episode number ('Nummer ... Staffel')
# and the german title ('Deutscher Titel') are located.
#
# table - String with the wikitext of the table
#
# Returns an Array of episode names where the episode number is the
# index. Rows that are too short to contain both cells are skipped.
# Raises ArgumentError if table is not a String.
def self.parse_inarticle_season_table(table)
  raise ArgumentError, 'String with seasontable expected' unless
    table.is_a?(String)

  season_data = []
  episode_nr_line_nr = nil
  episode_name_line_nr = nil

  table.split(/^\|\-.*$/).each do |tablerow|
    # skip invalid rows
    lines = tablerow.strip.lines.to_a
    next unless lines.length >= 4

    if tablerow.match(/width=\"\d+\"/)
      # header row: remember the line numbers of the needed cells
      lines.each_with_index do |item, index|
        if item.match(/Nummer.*Staffel/i)
          episode_nr_line_nr = index

        # TODO make the following more variable
        elsif item.match(/Deutscher.*Titel/i)
          episode_name_line_nr = index
        end
      end
      next
    end

    # data rows can only be read once a header row was seen
    next unless episode_nr_line_nr && episode_name_line_nr

    # a row may have fewer lines than the header (e.g. merged cells)
    nr_line = lines[episode_nr_line_nr]
    name_line = lines[episode_name_line_nr]
    next unless nr_line && name_line

    md_nr = nr_line.strip.match(/(\d+)/)
    next unless md_nr

    md_name = name_line.strip.match(/^\|.(.*)$/)
    next unless md_name

    # remove quotes (italic/bold markup) and link brackets
    episode_name = md_name[1].gsub(/[\'\"\[\]]/, "")
    next unless episode_name.match(/\w+/)

    season_data[md_nr[1].to_i] = episode_name.strip
  end

  season_data
end
347
-
348
-
349
# Extracts all episodes from a wikitable in which every episode is
# written on a single line (used e.g. for 'Prison Break'), in contrast
# to the format where every cell has its own line.
#
# Example of such a wikitable:
#
#   {| class="wikitable"
#   |- style="color:#black; background-color:#006699"
#   ! '''Episode''' !! '''Deutscher Titel''' !! '''Originaltitel''' !! '''Erstausstrahlung (DE)''' !! '''Erstausstrahlung (USA)'''
#   |-
#   |'''1''' (1-01) || Der große Plan || Pilot || 21. Juni 2007 || 29. August 2005
#   |-
#   |'''2''' (1-02) || Lügt Lincoln? || Allen || 21. Juni 2007 || 29. August 2005
#
# The header row tells which columns hold the episode number
# ('Episode') and the german title ('Deutsch... Titel').
#
# table - String with the wikitext of the table
#
# Returns an Array of episode names where the episode number is the
# index. Rows without an episode number or without a title, and rows
# that appear before any header row, are skipped.
# Raises ArgumentError if table is not a String.
def self.parse_inarticle_season_table_with_one_line(table)
  raise ArgumentError, 'String with seasontable expected' unless
    table.is_a?(String)

  season_data = []
  episode_nr_col = nil
  episode_name_col = nil

  table.split(/^\|\-.*$/).each do |tablerow|

    if tablerow.match(/!!.*!!.*!!/)
      # extract column numbers from table header
      tablerow.split(/!!/).each_with_index do |col, index|
        episode_nr_col = index if col.match(/Episode/i)
        episode_name_col = index if col.match(/Deutsch.*Titel/i)
      end

    elsif tablerow.match(/\|\|.*\w+.*\|\|/)
      # without a header the columns cannot be identified
      next unless episode_nr_col && episode_name_col

      columns = tablerow.strip.split(/\|\|/)
      nr_column = columns[episode_nr_col]
      name_column = columns[episode_name_col]
      next unless nr_column && name_column

      # the following cleans up the column so that
      # " '''7''' (1-07) " becomes "7 1 07"
      #
      # the last bunch of digits is the episode number, which makes
      # this fairly independent of the exact format
      md_nr = nr_column.gsub(/\D/, " ").strip.match(/(\d+)$/)
      next unless md_nr

      # the title (not the number) has to contain a word character
      episode_name = name_column.strip
      next unless episode_name.match(/\w+/)

      season_data[md_nr[1].to_i] = episode_name
    end
  end

  season_data
end
411
-
412
-
413
# Tells whether the given wikitext belongs to the main page of a series.
#
# Returns true if the page matches @@SERIES_SITE_TEST_PATTERN, i.e.
# contains the infobox that is typical for series pages in wikipedia,
# false otherwise.
def self.is_series_main_page?(page)
  !page.match(@@SERIES_SITE_TEST_PATTERN).nil?
end
421
-
422
# Tells whether the given wikitext is a disambiguation page.
#
# Returns true if the page matches @@DISAMBIGUATION_TEST_PATTERN, i.e.
# it links to pages about different subjects with the same name,
# false otherwise.
def self.is_disambiguation_site?(page)
  !page.match(@@DISAMBIGUATION_TEST_PATTERN).nil?
end
429
-
430
# Tells whether the given wikitext links to a separate article that
# holds the episode list (matches @@CONTAINS_LINK_TO_EPISODE_LIST).
def self.contains_link_to_episode_list?(page)
  !page.match(@@CONTAINS_LINK_TO_EPISODE_LIST).nil?
end
435
-
436
# Tells whether the given wikitext itself contains an episode list
# (matches @@CONTAINS_INARTICLE_EPISODE_LIST).
def self.contains_inarticle_episode_list?(page)
  !page.match(@@CONTAINS_INARTICLE_EPISODE_LIST).nil?
end
440
-
441
# Tells which kind of in-article episode list is given: returns true
# if the text matches @@IS_ONE_LINE_EPISODE_LIST (one episode per
# line), false otherwise.
def self.is_episode_list_with_one_episode_per_line?(page)
  !page.match(@@IS_ONE_LINE_EPISODE_LIST).nil?
end
445
- end
446
- end
data/lib/plugin.rb DELETED
@@ -1,8 +0,0 @@
1
# Put the directory of this file on the load path unless it is
# already there (either as given or as absolute path).
$LOAD_PATH.unshift(File.dirname(__FILE__)) unless
  $LOAD_PATH.include?(File.dirname(__FILE__)) ||
  $LOAD_PATH.include?(File.expand_path(File.dirname(__FILE__)))


# Namespace for all plugins; requires every ruby file that is located
# in the 'plugin' directory next to this file.
module Plugin

  # The result of Dir[] is sorted explicitly because its order depends
  # on the platform in older ruby versions; sorting makes the load
  # order of the plugins deterministic.
  Dir[File.join(File.dirname(__FILE__), 'plugin', '*.rb')].sort.each do |file|
    require file
  end
end