serienrenamer 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/History.txt +4 -0
- data/Manifest.txt +23 -0
- data/README.rdoc +52 -0
- data/Rakefile +27 -0
- data/bin/serienrenamer +155 -0
- data/lib/plugin/serienjunkies_de.rb +129 -0
- data/lib/plugin/serienjunkies_feed.rb +105 -0
- data/lib/plugin/textfile.rb +50 -0
- data/lib/plugin/wikipedia.rb +362 -0
- data/lib/plugin.rb +8 -0
- data/lib/serienrenamer/episode.rb +313 -0
- data/lib/serienrenamer.rb +28 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/serienrenamer.gemspec +52 -0
- data/test/serienjunkies_feed_sample.xml +14472 -0
- data/test/test_episode.rb +188 -0
- data/test/test_helper.rb +4 -0
- data/test/test_plugin_serienjunkies_de.rb +95 -0
- data/test/test_plugin_serienjunkies_feed.rb +75 -0
- data/test/test_plugin_textfile.rb +38 -0
- data/test/test_plugin_wikipedia.rb +178 -0
- metadata +161 -0
@@ -0,0 +1,362 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'media_wiki'
|
3
|
+
|
4
|
+
module Plugin
|
5
|
+
|
6
|
+
# This Plugin tries to extract the series
|
7
|
+
# information from wikipedia
|
8
|
+
#
|
9
|
+
# (by now only the german wikipedia)
|
10
|
+
class Wikipedia < Serienrenamer::Pluginbase
|
11
|
+
|
12
|
+
# Name under which this plugin identifies itself.
def self.plugin_name
  "Wikipedia"
end
|
13
|
+
# Reports this plugin as usable (always true).
def self.usable
  true
end
|
14
|
+
# Priority value of this plugin (constant 5).
def self.priority
  5
end
|
15
|
+
|
16
|
+
# Endpoint of the MediaWiki API of the German Wikipedia. Wikimedia
# sites are served via HTTPS only, so the secure URL is used directly
# instead of relying on a redirect from plain HTTP.
@@WIKIPEDIA_URL = 'https://de.wikipedia.org/w/api.php'

# patterns used in this class

# an "{{Episodenlistentabelle ... }}" template; the template text is
# available through the named group "table"
@@EPISODE_TABLE_PATTERN = /.*(?<table>\{\{Episodenlistentabelle.*\}\})\s*$/m

# separator used to split an episode table into its single entries
@@EPISODE_ENTRY_PATTERN = /\{\{Episodenlisteneintrag|S-Episode/

# infobox template used to recognize the main page of a series
@@SERIES_SITE_TEST_PATTERN = /\{\{Infobox.Fernsehsendung.*\}\}/m

# template used to recognize a disambiguation page
@@DISAMBIGUATION_TEST_PATTERN = /\{\{Begriffsklärung\}\}/m

# "Hauptartikel" reference to a separate "Liste ..." article; the title
# of that article is available through the named group "main"
@@CONTAINS_LINK_TO_EPISODE_LIST = /Hauptartikel.*(?<main>Liste.*?)[\]\}]+/

# season block ("Staffel <nr>") followed by a wikitable with a title
# column; group 1 holds the season number
@@CONTAINS_INARTICLE_EPISODE_LIST = /\<div.*\>Staffel.(\d+).*\<\/div\>.*class=\"wikitable\".*titel/m

# separator between the season blocks of an in-article episode list
@@INPAGE_SEASON_SEPARATOR = /\<div.style=\"clear:both\;.class=\"NavFrame\"\>/

# extracts a complete wikitable ("{| class="wikitable" ... |}") as group 1
@@WIKITABLE_EXTRACT_PATTERN = /(\{\|.class=\"wikitable\".*\|\})\n/m
|
27
|
+
|
28
|
+
# this method will be called from the main program
|
29
|
+
# with a Serienrenamer::Episode instance as parameter
|
30
|
+
#
|
31
|
+
# it returns an array of episode information
|
32
|
+
# Looks up the name of the given episode in Wikipedia.
#
# The parsed episode lists are cached per series name in @cached_data,
# so the wiki is only queried for series that were not seen before.
#
# episode:: Serienrenamer::Episode instance whose series, season and
#           episode attributes are used for the lookup
#
# Returns an array that contains the episode name, or an empty array
# if no series page, no episode list or no matching entry was found.
#
# Raises ArgumentError if episode is not a Serienrenamer::Episode.
def self.generate_episode_information(episode)
  raise ArgumentError, "Serienrenamer::Episode instance needed" unless
    episode.is_a?(Serienrenamer::Episode)

  return [] unless episode.series.match(/\w+/)

  @cached_data ||= {}

  unless @cached_data.has_key?(episode.series)
    wiki = MediaWiki::Gateway.new(@@WIKIPEDIA_URL)

    # search for a series site in wikipedia
    series_site = find_series_site(wiki, episode.series)
    return [] unless series_site

    # look for a link to a list of episodes
    pagedata = wiki.get(series_site)
    link = pagedata.match(@@CONTAINS_LINK_TO_EPISODE_LIST)

    if link
      episodelist_page = wiki.get(link[:main])
      @cached_data[episode.series] =
        parse_episodelist_page_data(episodelist_page)
    elsif contains_inarticle_episode_list?(pagedata)
      @cached_data[episode.series] =
        parse_inarticle_episodelist_page_data(pagedata)
    else
      warn "no episode list found"
      return []
    end
  end

  # tries to find an episodename in the cached data; a missing season
  # or episode simply results in an empty array
  series = @cached_data[episode.series]
  season = series && series[episode.season]
  episodename = season && season[episode.episode]

  return [] unless episodename && episodename.match(/\w+/)
  [episodename]
end

# Searches the wiki for the main page of a series.
#
# The full series name is tried first; if that gives no result and the
# name ends with a different last word, this last word is tried too.
# API errors are retried, with at most three failed requests in total.
#
# Returns the title of the series page or nil if nothing was found.
def self.find_series_site(wiki, series_name)
  tries = 3

  search_patterns = [series_name]
  last_word = series_name[/(\w+)\s*$/, 1]
  search_patterns << last_word if last_word && last_word != series_name

  search_patterns.each do |search_pattern|
    begin
      wiki.search(search_pattern, nil, 50).each do |title|
        return title if is_series_main_page?(wiki.get(title))
      end
    rescue MediaWiki::APIError
      tries -= 1
      retry if tries > 0
      return nil
    end
  end

  nil
end
private_class_method :find_series_site
|
115
|
+
|
116
|
+
# This method will extract season based information
|
117
|
+
# from a string that contains a wikipedia episodelist page
|
118
|
+
#
|
119
|
+
# returns an Array of Arrays with episode information
|
120
|
+
# where episode and season numbers are the indices
|
121
|
+
# Extracts the episode names of all seasons from the wikitext of an
# episode list page.
#
# The page is split at its headings. A heading of the form
# "Staffel <nr>" announces that the next non-heading section holds the
# episode table of that season.
#
# pagedata:: String with the wikitext of the page
# debug::    accepted for compatibility, not used
#
# Returns an Array indexed by season number whose items are the arrays
# produced by parse_season_table.
#
# Raises ArgumentError if pagedata is not a String.
def self.parse_episodelist_page_data(pagedata, debug=false)
  unless pagedata.is_a?(String)
    raise ArgumentError, 'String with pagedata expected'
  end

  seasons = []

  # number of the season whose table is expected in the next section,
  # nil while no season heading is pending
  pending_season = nil

  pagedata.split(/(==.*)==/).each do |section|
    if section.match(/^==.*Staffel/)
      heading = section.match(/^==.*Staffel.(?<seasonnr>\d+)/)
      pending_season = heading[:seasonnr].to_i if heading
    elsif pending_season
      # extract season table from this section
      seasons[pending_season] = parse_season_table(section)
      pending_season = nil
    end
  end

  seasons
end
|
151
|
+
|
152
|
+
# this method will be called with a wikipedia seasontable
|
153
|
+
# as parameter and will extract all episodes from this
|
154
|
+
# and returns that as an array where the episode number is
|
155
|
+
# the index
|
156
|
+
# Extracts all episodes from the wikitext of a season table.
#
# The entries of the table look like the following:
#
#   {{Episodenlisteneintrag
#   | NR_GES = 107
#   | NR_ST = 1
#   | OT = The Mastodon in the Room
#   | DT = Die Rückkehr der Scheuklappen
#   | ZF =
#   | EA = {{dts|23|09|2010}}
#   | EAD = {{dts|08|09|2011}}
#   }}
#
# table:: String that contains the season table
#
# Returns an Array in which the episode number (NR_ST) is the index
# and the german title (DT) is the value. Entries without episode
# number or without a usable title are skipped.
#
# Raises ArgumentError if table is not a String.
def self.parse_season_table(table)
  raise ArgumentError, 'String with seasontable expected' unless
    table.is_a?(String)

  season_data = []

  matched_table = table.match(@@EPISODE_TABLE_PATTERN)
  return season_data unless matched_table

  matched_table[:table].split(@@EPISODE_ENTRY_PATTERN).each do |epi|

    # build up a hash from the "| KEY = value" lines of the entry
    infos = {}
    epi.lines.each do |part|
      parts = part.strip.match(/(?<key>\w+).=.(?<value>.*)$/)
      infos[parts[:key].strip] = parts[:value].strip if parts
    end

    # a line with an empty value (e.g. "| DT =") does not produce a
    # key, so both keys have to be checked before they are used
    next unless infos.has_key?('NR_ST') && infos.has_key?('DT')

    epi_nr = infos['NR_ST'].to_i

    # TODO make the following variable
    #
    # remove all html tags and all following text from the
    # episode name and the link syntax from mediawiki [[text]]
    epi_name = infos['DT'].gsub(/<\/?[^>]*>.*/, "").gsub(/[\[\[\]\]]/, "").strip
    next unless epi_name.match(/\w+/)

    season_data[epi_nr] = epi_name
  end

  season_data
end
|
214
|
+
|
215
|
+
# This method will extract season based information
|
216
|
+
# from a string that contains a series page with an
|
217
|
+
# episodelist included
|
218
|
+
#
|
219
|
+
# returns an Array of Arrays with episode information
|
220
|
+
# where episode and season numbers are the indices
|
221
|
+
# Extracts the episode names of all seasons from the wikitext of a
# series page that contains its episode list itself.
#
# pagedata:: String with the wikitext of the series page
# debug::    accepted for compatibility, not used
#
# Returns an Array indexed by season number whose items are the arrays
# produced by parse_inarticle_season_table.
#
# Raises ArgumentError if pagedata is not a String or if the page
# contains no section with an episode list.
def self.parse_inarticle_episodelist_page_data(pagedata, debug=false)
  raise ArgumentError, 'String with pagedata expected' unless
    pagedata.is_a?(String)

  series_data = []

  # look for a paragraph with an episodelist
  episodelist_paragraph = pagedata.split(/==.*==/).find { |p|
    contains_inarticle_episode_list?(p) }

  raise ArgumentError, 'no episodelist found' unless episodelist_paragraph

  # iterate through all seasons in this episode table
  episodelist_paragraph.split(@@INPAGE_SEASON_SEPARATOR).each do |season|
    season_match = season.match(@@CONTAINS_INARTICLE_EPISODE_LIST)
    next unless season_match

    # skip season blocks whose wikitable can not be extracted instead
    # of failing on the missing match
    table_match = season.match(@@WIKITABLE_EXTRACT_PATTERN)
    next unless table_match

    season_nr = season_match[1].to_i
    episodes = parse_inarticle_season_table(table_match[1])

    # HACK if a season is split into different parts
    # eg. Flashpoint (2.1 and 2.2) then merge them if possible:
    # episodes of the earlier part fill the gaps of the current one
    (series_data[season_nr] || []).each_with_index do |item, index|
      episodes[index] ||= item
    end

    series_data[season_nr] = episodes
  end

  series_data
end
|
255
|
+
|
256
|
+
# this method will be called with a wikitable for a season
|
257
|
+
# as parameter and will extract all episodes from this
|
258
|
+
# and returns that as an array where the episode number is
|
259
|
+
# the index
|
260
|
+
#
|
261
|
+
# Example for an wikitable for episodes:
|
262
|
+
#
|
263
|
+
# {| class="wikitable" width="100%"
|
264
|
+
# |- vertical-align: top; text-align:center; "
|
265
|
+
# | width="15" | '''Nummer''' <br /><small>(Gesamt)<small>
|
266
|
+
# | width="15" | '''Nummer''' <br /><small>(Staffel)<small>
|
267
|
+
# ! width="250" | Originaltitel
|
268
|
+
# ! width="250" | Deutscher Titel
|
269
|
+
# ! width="180" | Erstausstrahlung<br /><small>(USA Network)</small>
|
270
|
+
# ! width="180" | Erstausstrahlung<br /><small>(RTL)</small>
|
271
|
+
# ! width="180" | Erstausstrahlung<br /><small>(SF zwei)</small>
|
272
|
+
# |-
|
273
|
+
# | bgcolor="#DFEEEF"| 01
|
274
|
+
# | 01
|
275
|
+
# | ''Pilot''
|
276
|
+
# | ''Auch Reiche sind nur Menschen''
|
277
|
+
# | 4. Mai 2009
|
278
|
+
# | 17. Mai 2011
|
279
|
+
# | 6. Juni 2011 (Teil 1)<br />13. Juni 2011 (Teil 2)
|
280
|
+
# |-
|
281
|
+
# |}
|
282
|
+
#
|
283
|
+
# Extracts all episodes from the wikitable of a single season.
#
# The table is split into rows at the "|-" lines. The header row (the
# row with width="..." attributes) determines which line of a row
# holds the episode number ("Nummer ... Staffel") and which one holds
# the german title ("Deutscher Titel"); these positions are then read
# from every following data row.
#
# table:: String that contains the wikitable
#
# Returns an Array in which the episode number is the index and the
# episode name is the value. Rows with less than four lines, rows that
# are shorter than the positions found in the header and rows without
# number or name are skipped.
#
# Raises ArgumentError if table is not a String.
def self.parse_inarticle_season_table(table)
  raise ArgumentError, 'String with seasontable expected' unless
    table.is_a?(String)

  season_data = []
  episode_nr_line_nr = nil
  episode_name_line_nr = nil

  table.split(/^\|\-.*$/).each do |tablerow|

    # skip invalid rows
    lines = tablerow.strip.lines.to_a
    next unless lines.length >= 4

    if tablerow.match(/width=\"\d+\"/)
      # extract line numbers for needed data that
      # are in the table header
      lines.each_with_index do |item, index|
        if item.match(/Nummer.*Staffel/i)
          episode_nr_line_nr = index

        # TODO make the following more variable
        elsif item.match(/Deutscher.*Titel/i)
          episode_name_line_nr = index
        end
      end
      next
    end

    # data rows can only be read after a header row was seen
    next unless episode_nr_line_nr && episode_name_line_nr

    # a data row may have fewer lines than the header row
    nr_line = lines[episode_nr_line_nr]
    name_line = lines[episode_name_line_nr]
    next unless nr_line && name_line

    md_nr = nr_line.strip.match(/(\d+)/)
    md_name = name_line.strip.match(/^\|.(.*)$/)
    next unless md_nr && md_name

    # remove quoting and link syntax from the name
    episode_name = md_name[1].gsub(/[\'\"\[\]]/, "")
    next unless episode_name.match(/\w+/)

    season_data[md_nr[1].to_i] = episode_name.strip
  end

  season_data
end
|
333
|
+
|
334
|
+
# this method checks if the page is the main page
|
335
|
+
# for a series
|
336
|
+
#
|
337
|
+
# returns true if page contains the infobox that
|
338
|
+
# is typical for series pages in wikipedia
|
339
|
+
# Tells whether the given wikitext matches the pattern that
# identifies the main page of a series (the series infobox).
def self.is_series_main_page?(page)
  !page.match(@@SERIES_SITE_TEST_PATTERN).nil?
end
|
342
|
+
|
343
|
+
# check the site if it is a disambiguation site
|
344
|
+
#
|
345
|
+
# returns true if this site links to pages with
|
346
|
+
# themes with the same name
|
347
|
+
# Tells whether the given wikitext matches the pattern that
# identifies a disambiguation page.
def self.is_disambiguation_site?(page)
  !page.match(@@DISAMBIGUATION_TEST_PATTERN).nil?
end
|
350
|
+
|
351
|
+
# test if the page contains a link to an article
|
352
|
+
# with an episode list
|
353
|
+
# Tells whether the given wikitext references a separate article
# with an episode list.
def self.contains_link_to_episode_list?(page)
  !page.match(@@CONTAINS_LINK_TO_EPISODE_LIST).nil?
end
|
356
|
+
|
357
|
+
# test if the page contains a episode list
|
358
|
+
# Tells whether the given wikitext contains an episode list itself.
def self.contains_inarticle_episode_list?(page)
  !page.match(@@CONTAINS_INARTICLE_EPISODE_LIST).nil?
end
|
361
|
+
end
|
362
|
+
end
|
data/lib/serienrenamer/episode.rb
ADDED
@@ -0,0 +1,313 @@
|
|
1
|
+
# coding: UTF-8
|
2
|
+
require 'find'
|
3
|
+
require 'fileutils'
|
4
|
+
require 'wlapi'
|
5
|
+
|
6
|
+
module Serienrenamer
|
7
|
+
|
8
|
+
class Episode
|
9
|
+
|
10
|
+
attr_reader :season, :episode, :episodename,
  :extension, :episodepath, :success, :source_directory
attr_accessor :episodename_needed, :series

# patterns for suitable episodes; the first matching pattern is used
@@PATTERNS = [
  # S01E01
  /^(?<series>.*)S(?<season>\d+)E(?<episode>\d+)(?<episodename>.*)$/i,
  # 101; 1212
  /^(?<series>.*\D)(?<season>\d+)(?<episode>\d{2})(?<episodename>\W*.*)$/,
  # 1x1; 12x12
  # (the series part is matched non-greedy, otherwise it swallows the
  # leading digits of a season number with more than one digit)
  /^(?<series>.*?)(?<season>\d+)x(?<episode>\d+)(?<episodename>.*)$/,
].freeze

# allowed endings for episode files
@@ENDINGS = %w( mpg mpeg avi mkv wmv mp4 mov flv 3gp ).freeze

# trash words that are removed from the episodename
@@TRASH_WORDS = %w(
  German Dubbed DVDRip HDTVRip XviD ITG TVR inspired HDRip
  AMBiTiOUS RSG SiGHT SATRip WS TVS RiP READ GERMAN dTV aTV
  iNTERNAL CRoW MSE c0nFuSed UTOPiA scum EXPiRED BDRiP HDTV
  iTunesHD 720p x264 h264 CRiSP euHD WEBRiP ZZGtv ARCHiV DD20
  Prim3time Nfo Repack SiMPTY BLURAYRiP BluRay DELiCiOUS Synced
  UNDELiCiOUS fBi CiD iTunesHDRip RedSeven OiNK idTV DL DD51
).freeze
|
36
|
+
|
37
|
+
# Constructor for the Episode-Class, which takes an episode as
|
38
|
+
# argument and extracts as much as information from the file
|
39
|
+
# that it can.
|
40
|
+
# Constructor for the Episode-Class, which takes the path of an
# episode (a video file or a directory that contains one) and
# extracts series, season, episode number and episode name from the
# name of that file or directory.
#
# episodepath::        path to a video file or a directory
# episodename_needed:: whether to_s should include the episode name
#
# Raises ArgumentError if the path is missing or does not exist, if
# no video file is found or if no episode information can be
# extracted.
def initialize(episodepath, episodename_needed=true)

  raise ArgumentError, 'no episodepath provided' unless episodepath

  # make some checks on the given episode path
  # (File.exist? is true for files as well as for directories; the
  # old exists? aliases are no longer available in current rubies)
  unless File.exist?(episodepath)
    raise ArgumentError, "episodepath not existing"
  end

  # determined only once because a directory has to be traversed
  videofile = Episode.determine_video_file(episodepath)
  raise ArgumentError, 'no videofile found' unless videofile

  @source_directory = nil

  # normalize information for dirs/files
  basepath = File.basename(episodepath)

  if File.file?(episodepath)
    basepath = basepath.chomp(File.extname(basepath))
  elsif File.directory?(episodepath)
    @source_directory = episodepath

    # if directory does not contain episode information
    # check for a text file with suitable information
    unless Episode.contains_episode_information?(basepath)
      info = Plugin::Textfile.generate_episode_information(episodepath)[0]
      basepath = info if info
    end
  end

  unless Episode.contains_episode_information?(basepath)
    raise ArgumentError, 'no episode information existing'
  end

  @episodepath = videofile

  infos = Episode.extract_episode_information(basepath)
  raise ArgumentError, 'no suitable regex pattern matches' unless infos

  @series = Episode.clean_episode_data(infos[:series]).strip
  @episodename = Episode.clean_episode_data(
    infos[:episodename], true, true).strip
  @season = infos[:season].to_i
  @episode = infos[:episode].to_i

  # setting up special behaviour
  @episodename_needed = episodename_needed
  @extension = File.extname(@episodepath).delete('.')
  @success = false
end
|
91
|
+
|
92
|
+
# Returns the episode information into a format like
|
93
|
+
# S0xE0x, depending on @episodename_needed it includes
|
94
|
+
# the episodename
|
95
|
+
# Builds the clean file name of this episode in the form
# "S01E02 - Name.ext"; the name part is left out ("S01E02.ext")
# when @episodename_needed is not set.
def to_s
  unless @episodename_needed
    return "S%.2dE%.2d.%s" % [ @season, @episode, @extension ]
  end

  "S%.2dE%.2d - %s.%s" % [ @season, @episode, @episodename, @extension ]
end
|
102
|
+
|
103
|
+
# this method makes it possible to set the episodename
|
104
|
+
# afterwards
|
105
|
+
#
|
106
|
+
# options:
|
107
|
+
# :data
|
108
|
+
# string that contains episodename information
|
109
|
+
# :need_cleanup
|
110
|
+
# if true than it will apply the standard regex
|
111
|
+
# to clean the string and extracts that with
|
112
|
+
# the standard patterns
|
113
|
+
# if false the string will applied without any
|
114
|
+
# checks or cleanup
|
115
|
+
# :extract_seriesname
|
116
|
+
# tries to extract the seriesname from data
|
117
|
+
# Sets the episodename of this episode afterwards.
#
# data::               string that contains the episodename
# need_cleanup::       if true the string is cleaned up; when it matches
#                      one of the episode patterns only its episodename
#                      part is used. If false the string is applied as
#                      it is
# extract_seriesname:: if true (and need_cleanup is set) the series name
#                      is taken over from data too, provided that data
#                      contains one
#
# Returns the new episodename, or nil if no data was given.
def add_episode_information(data, need_cleanup=true, extract_seriesname=false)
  return unless data

  if need_cleanup
    infos = nil
    if Episode.contains_episode_information?(data)
      infos = Episode.extract_episode_information(data)
    end

    if infos
      data = infos[:episodename]

      # try to extract seriesname if needed
      if extract_seriesname && infos[:series].match(/\w+/)
        @series = Episode.clean_episode_data(infos[:series]).strip
      end
    end

    data = Episode.clean_episode_data(data, true, true).strip
  end

  @episodename = data
end
|
137
|
+
|
138
|
+
# renames the given episodefile into the new
|
139
|
+
# clean format and sets the status on success
|
140
|
+
#
|
141
|
+
# Moves the episode file into destination_dir using the clean name
# produced by to_s. If the episode came from a directory, that
# directory is removed afterwards. @success is set on success; a
# failing system call is only reported on standard output.
#
# Raises IOError if the episode file does not exist.
def rename(destination_dir=".")
  unless File.file?(@episodepath)
    raise IOError, 'episode file not existing'
  end

  target = File.join(destination_dir, to_s)

  begin
    File.rename(@episodepath, target)
    FileUtils.remove_dir(@source_directory) if @source_directory
    @success = true
  rescue SystemCallError => error
    puts "Rename failed: #{error}"
  end
end
|
157
|
+
|
158
|
+
##################
|
159
|
+
# static methods #
|
160
|
+
##################
|
161
|
+
|
162
|
+
# cleans strings from things that can occur in
|
163
|
+
# episode files like dots (.) and trash words
|
164
|
+
#
|
165
|
+
# parameter:
|
166
|
+
# :data
|
167
|
+
# string that will be cleaned
|
168
|
+
# :include_trashwords
|
169
|
+
# remove Words like German or Dubbed from
|
170
|
+
# the string (Trashwords)
|
171
|
+
# :repair_umlauts
|
172
|
+
# try to repair broken umlauts if they occur
|
173
|
+
#
|
174
|
+
# Cleans strings from things that can occur in episode files like
# separators (. _ -) and trash words. The given string itself is not
# modified, the cleaned version is returned as a new string.
#
# data::               string that will be cleaned
# include_trashwords:: remove words like German or Dubbed from the
#                      string (trash words)
# repair_umlauts::     try to repair broken umlauts if they occur
#                      (only applied together with include_trashwords)
#
# Returns the cleaned string.
def self.clean_episode_data(data, include_trashwords=false, repair_umlauts=false)
  data = data.gsub(/[._-]/, " ")
  return data unless include_trashwords

  # All trash words are removed from the string. If two trash words
  # occur in a row, all trailing words are removed. If a word is
  # removed and the next one is not a trash word, the removed word is
  # included again.
  purge_count = 0
  last_purge = nil
  cleanwords = []

  data.split(/ /).each do |word|
    next unless word.match(/\w+/)

    word = self.repair_umlauts(word) if repair_umlauts

    # plain case-insensitive comparison, so that words with regex
    # meta characters can neither fail nor match accidentally
    if @@TRASH_WORDS.any? { |trash| trash.casecmp(word) == 0 }
      purge_count += 1
      last_purge = word

      break if purge_count == 2
    else
      if purge_count == 1 && last_purge
        cleanwords.push(last_purge)
        purge_count = 0
      end
      cleanwords.push(word)
    end
  end

  cleanwords.join(" ")
end
|
213
|
+
|
214
|
+
# This method tries to repair some german umlauts so that
|
215
|
+
# the following occurs
|
216
|
+
#
|
217
|
+
# ae => ä ; ue => ü ; oe => ö ; Ae => Ä ; Ue => Ü ; Oe => Ö
|
218
|
+
#
|
219
|
+
# This method uses a webservice at:
|
220
|
+
# http://wortschatz.uni-leipzig.de/
|
221
|
+
# which produces statistics about the german language and
|
222
|
+
# e.g. frequency of words occuring in the german language
|
223
|
+
#
|
224
|
+
# this method convert all broken umlauts in the word and compares
|
225
|
+
# the frequency of both version and uses the version which is more
|
226
|
+
# common
|
227
|
+
#
|
228
|
+
# returns an repaired version of the word if necessary
|
229
|
+
# Tries to repair transcribed german umlauts in a word
# (ae => ä, ue => ü, oe => ö and Ae/Ue/Oe at the start of the word).
#
# Both the given word and the converted word are looked up with
# WLAPI::API#frequencies; the converted word is only used if its
# frequency value is higher than the one of the given word. A lookup
# without result counts as -1.
#
# Returns the repaired word, or the unchanged word if it contains no
# candidate or if the conversion is not more frequent.
def self.repair_umlauts(word)
  return word unless contains_eventual_broken_umlauts?(word)

  candidate = word.gsub(/ae/, 'ä').gsub(/ue/, 'ü').gsub(/oe/, 'ö')
  candidate.gsub!(/^Ae/, 'Ä')
  candidate.gsub!(/^Ue/, 'Ü')
  candidate.gsub!(/^Oe/, 'Ö')

  ws = WLAPI::API.new

  res_broken = ws.frequencies(word)
  freq_broken = res_broken.nil? ? -1 : res_broken[0].to_i

  res_repaired = ws.frequencies(candidate)
  freq_repaired = res_repaired.nil? ? -1 : res_repaired[0].to_i

  freq_repaired > freq_broken ? candidate : word
end
|
252
|
+
|
253
|
+
# checks for eventual broken umlauts
|
254
|
+
#
|
255
|
+
# returns true if broken umlaut if included
|
256
|
+
# Tells whether the string contains a character sequence that may be
# a transcribed umlaut (ae, ue, oe, Ae, Ue or Oe).
def self.contains_eventual_broken_umlauts?(string)
  string.match(/ae|ue|oe|Ae|Ue|Oe/) ? true : false
end
|
259
|
+
|
260
|
+
# tries to match the given string against
|
261
|
+
# all supported regex-patterns and returns true if a
|
262
|
+
# suitable regex is found
|
263
|
+
# Tells whether the given string matches at least one of the
# supported episode patterns.
def self.contains_episode_information?(info)
  @@PATTERNS.any? { |pattern| info.match(pattern) }
end
|
271
|
+
|
272
|
+
# tries to find a suitable pattern and returns
|
273
|
+
# the matched data or nil if nothing matched
|
274
|
+
# Matches the given string against the supported episode patterns.
#
# Returns the match data of the first pattern that matches, or nil
# if none of the patterns matches.
def self.extract_episode_information(info)
  @@PATTERNS.each do |pattern|
    return pattern.match(info) if info.match(pattern)
  end

  nil
end
|
282
|
+
|
283
|
+
# tries to find a valid video file in a given path.
|
284
|
+
#
|
285
|
+
# If path is a file it returns path unchanged if file
|
286
|
+
# is a valid video file or nil unless
|
287
|
+
#
|
288
|
+
# If path is a dir it searches for the biggest valid
|
289
|
+
# videofile in it and returns the path or nil if nothing
|
290
|
+
# found
|
291
|
+
# Tries to find a valid video file in a given path. A file counts as
# video file if its extension (compared case-insensitively) is one of
# the allowed endings.
#
# If path is a file, it is returned unchanged if it is a video file,
# otherwise nil is returned.
#
# If path is a directory, it is searched recursively and the path of
# the biggest video file in it is returned, or nil if there is none.
#
# For everything else nil is returned.
def self.determine_video_file(path)
  # compares the real extension instead of the end of the path, so
  # that names which merely end with the letters of an ending (or
  # directories with such names) are not taken for video files
  video_file = lambda do |file|
    File.file?(file) &&
      @@ENDINGS.include?(File.extname(file).delete('.').downcase)
  end

  return (video_file.call(path) ? path : nil) if File.file?(path)
  return nil unless File.directory?(path)

  Find.find(path).select { |file| video_file.call(file) }.
    max_by { |file| File.size(file) }
end
|
312
|
+
end
|
313
|
+
end
|