RubyGems - serienrenamer - Versions diffs - 0.0.1 - Mend

serienrenamer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

data/.gemtest +0 -0
data/History.txt +4 -0
data/Manifest.txt +23 -0
data/README.rdoc +52 -0
data/Rakefile +27 -0
data/bin/serienrenamer +155 -0
data/lib/plugin/serienjunkies_de.rb +129 -0
data/lib/plugin/serienjunkies_feed.rb +105 -0
data/lib/plugin/textfile.rb +50 -0
data/lib/plugin/wikipedia.rb +362 -0
data/lib/plugin.rb +8 -0
data/lib/serienrenamer/episode.rb +313 -0
data/lib/serienrenamer.rb +28 -0
data/script/console +10 -0
data/script/destroy +14 -0
data/script/generate +14 -0
data/serienrenamer.gemspec +52 -0
data/test/serienjunkies_feed_sample.xml +14472 -0
data/test/test_episode.rb +188 -0
data/test/test_helper.rb +4 -0
data/test/test_plugin_serienjunkies_de.rb +95 -0
data/test/test_plugin_serienjunkies_feed.rb +75 -0
data/test/test_plugin_textfile.rb +38 -0
data/test/test_plugin_wikipedia.rb +178 -0
metadata +161 -0

data/lib/plugin/wikipedia.rb ADDED Viewed

@@ -0,0 +1,362 @@
+# encoding: UTF-8
+require 'media_wiki'
+module Plugin
+    # This Plugin tries to extract the series
+    # information from wikipedia
+    #
+    # (by now only the german wikipedia)
+    class Wikipedia < Serienrenamer::Pluginbase
+        def self.plugin_name; "Wikipedia" end
+        def self.usable; true end
+        def self.priority; 5 end
+        @@WIKIPEDIA_URL = 'http://de.wikipedia.org/w/api.php'
+        # patterns used in this class
+        @@EPISODE_TABLE_PATTERN = /.*(?<table>\{\{Episodenlistentabelle.*\}\})\s*$/m
+        @@EPISODE_ENTRY_PATTERN = /\{\{Episodenlisteneintrag|S-Episode/
+        @@SERIES_SITE_TEST_PATTERN = /\{\{Infobox.Fernsehsendung.*\}\}/m
+        @@DISAMBIGUATION_TEST_PATTERN = /\{\{Begriffsklärung\}\}/m
+        @@CONTAINS_LINK_TO_EPISODE_LIST = /Hauptartikel.*(?<main>Liste.*?)[\]\}]+/
+        @@CONTAINS_INARTICLE_EPISODE_LIST = /\<div.*\>Staffel.(\d+).*\<\/div\>.*class=\"wikitable\".*titel/m
+        @@INPAGE_SEASON_SEPARATOR = /\<div.style=\"clear:both\;.class=\"NavFrame\"\>/
+        @@WIKITABLE_EXTRACT_PATTERN = /(\{\|.class=\"wikitable\".*\|\})\n/m
+        # this method will be called from the main program
+        # with an Serienrenamer::Episode instance as parameter
+        #
+        # it returns an array of episode information
+        def self.generate_episode_information(episode)
+            raise ArgumentError, "Serienrenamer::Episode instance needed" unless
+                episode.is_a?(Serienrenamer::Episode)
+            return [] unless episode.series.match(/\w+/)
+            unless defined? @cached_data
+                @cached_data = Hash.new
+            end
+            wiki = MediaWiki::Gateway.new(@@WIKIPEDIA_URL)
+            if ! @cached_data.has_key?(episode.series)
+                # search for a series site in wikipedia
+                series_site = nil
+                tries = 3
+                search_pattern = episode.series
+                search_pattern_modified = false
+                begin
+                    wiki.search(search_pattern, nil, 50).each do |title|
+                        pagedata = wiki.get(title)
+                        if is_series_main_page?(pagedata)
+                            series_site = title
+                            break
+                        end
+                    end
+                    # modify the search term pattern so that it contains
+                    # only the last word if the search_pattern contains
+                    # more than one words
+                    if series_site.nil? && ! search_pattern_modified
+                        search_pattern = search_pattern.match(/(\w+)\s*$/)[1]
+                        search_pattern_modified = true
+                        raise EOFError if search_pattern # break out and retry
+                    end
+                rescue MediaWiki::APIError => e
+                    tries -= 1
+                    retry if tries > 0
+                rescue EOFError => e
+                    retry
+                end
+                return [] unless series_site
+                # look for a link to a list of episodes
+                pagedata = wiki.get(series_site)
+                if contains_link_to_episode_list?(pagedata)
+                    mainarticle = pagedata.match(@@CONTAINS_LINK_TO_EPISODE_LIST)[:main]
+                    if mainarticle
+                        episodelist_page = wiki.get(mainarticle)
+                        series = parse_episodelist_page_data(episodelist_page)
+                        @cached_data[episode.series] = series
+                    end
+                elsif contains_inarticle_episode_list?(pagedata)
+                    series = parse_inarticle_episodelist_page_data(pagedata)
+                    @cached_data[episode.series] = series
+                else
+                    warn "no episode list found"
+                    return []
+                end
+            end
+            episode_names = []
+            # tries to find an episodename in cached_data
+            # otherwise returns empty array
+            begin
+                series = @cached_data[episode.series]
+                episodename = series[episode.season][episode.episode]
+                if episodename.match(/\w+/)
+                    episode_names.push(episodename)
+                end
+            rescue
+            end
+            return episode_names
+        end
+        # This method will extract season based information
+        # from a string that contains a wikipedia episodelist page
+        #
+        # returns an Array of Arrays with episode information
+        # where episode and season numbers are the indizes
+        def self.parse_episodelist_page_data(pagedata, debug=false)
+            raise ArgumentError, 'String with pagedata expected' unless
+                pagedata.is_a?(String)
+            series_data = []
+            is_season_table_following = false
+            season_number = nil
+            # split the wikipedia page by headings and process
+            # the following paragraph if the heading starts with
+            # 'Staffel'
+            pagedata.split(/(==.*)==/).each do |paragraph|
+                if paragraph.match(/^==.*Staffel/)
+                    match = paragraph.match(/^==.*Staffel.(?<seasonnr>\d+)/)
+                    if match
+                        season_number = match[:seasonnr].to_i
+                        is_season_table_following = true
+                    end
+                elsif is_season_table_following
+                    #
+                    # extract season table from this paragraph
+                    season = parse_season_table(paragraph)
+                    series_data[season_number] = season
+                    is_season_table_following = false
+                end
+            end
+            return series_data
+        end
+        # this method will be called with a wikipedia seasontable
+        # as parameter and will extract all episodes from this
+        # and returns that as an array where the episode number is
+        # the index
+        def self.parse_season_table(table)
+            raise ArgumentError, 'String with seasontable expected' unless
+                table.is_a?(String)
+            season_data = []
+            matched_table = table.match(@@EPISODE_TABLE_PATTERN)
+            if matched_table
+                # extract all episode entries that
+                # looks like the following
+                #
+                # {{Episodenlisteneintrag
+                # | NR_GES = 107
+                # | NR_ST = 1
+                # | OT = The Mastodon in the Room
+                # | DT = Die Rückkehr der Scheuklappen
+                # | ZF =
+                # | EA = {{dts|23|09|2010}}
+                # | EAD = {{dts|08|09|2011}}
+                # }}
+                episodes = matched_table[:table].split(@@EPISODE_ENTRY_PATTERN)
+                if episodes
+                    episodes.each do |epi|
+                        # build up a hash from the entry
+                        infos = {}
+                        epi.lines.each do |part|
+                            parts = part.strip.match(/(?<key>\w+).=.(?<value>.*)$/)
+                            if parts
+                                infos[parts[:key].strip] = parts[:value].strip
+                            end
+                        end
+                        next unless infos.has_key?('NR_ST')
+                        # extract useful information and
+                        # add it to the array
+                        epi_nr = infos['NR_ST'].to_i
+                        next unless epi_nr
+                        # TODO make the following variable
+                        epi_name = infos['DT'].strip
+                        # remove all html tags and all following
+                        # text from the episode name and the bold
+                        # syntax from mediawiki [[text]]
+                        epi_name.gsub!(/<\/?[^>]*>.*/, "")
+                        epi_name.gsub!(/[\[\[\]\]]/, "")
+                        next unless epi_name.match(/\w+/)
+                        season_data[epi_nr] = epi_name
+                    end
+                end
+            end
+            return season_data
+        end
+        # This method will extract season based information
+        # from a string that contains a series page with an
+        # episodelist included
+        #
+        # returns an Array of Arrays with episode information
+        # where episode and season numbers are the indizes
+        def self.parse_inarticle_episodelist_page_data(pagedata, debug=false)
+            raise ArgumentError, 'String with pagedata expected' unless
+                pagedata.is_a?(String)
+            series_data = []
+            # look for a paragraph with an episodelist
+            episodelist_paragraph = pagedata.split(/==.*==/).select { |p|
+                contains_inarticle_episode_list?(p) }[0]
+            raise ArgumentError, 'no episodelist found' unless episodelist_paragraph
+            # iterate through all seasons in this episode table
+            episodelist_paragraph.split(@@INPAGE_SEASON_SEPARATOR).each do |season|
+                next unless contains_inarticle_episode_list?(season)
+                season_nr = season.match(@@CONTAINS_INARTICLE_EPISODE_LIST)[1].to_i
+                wikitable = season.match(@@WIKITABLE_EXTRACT_PATTERN)[1]
+                episodes = parse_inarticle_season_table(wikitable)
+                # HACK if a season is splitted into different parts
+                # eg. Flashpoint (2.1 and 2.2) than merge that if possible
+                if series_data[season_nr] != nil
+                    series_data[season_nr].each_with_index do |item, index|
+                        episodes[index] = item unless episodes[index]
+                    end
+                end
+                series_data[season_nr] = episodes
+            end
+            return series_data
+        end
+        # this method will be called with a wikitable for a season
+        # as parameter and will extract all episodes from this
+        # and returns that as an array where the episode number is
+        # the index
+        #
+        # Example for an wikitable for episodes:
+        #
+        # {| class="wikitable" width="100%"
+        # |- vertical-align: top; text-align:center; "
+        # | width="15" | '''Nummer''' <br /><small>(Gesamt)<small>
+        # | width="15" | '''Nummer''' <br /><small>(Staffel)<small>
+        # ! width="250" | Originaltitel
+        # ! width="250" | Deutscher Titel
+        # ! width="180" | Erstausstrahlung<br /><small>(USA Network)</small>
+        # ! width="180" | Erstausstrahlung<br /><small>(RTL)</small>
+        # ! width="180" | Erstausstrahlung<br /><small>(SF zwei)</small>
+        # |-
+        # | bgcolor="#DFEEEF"| 01
+        # | 01
+        # | ''Pilot''
+        # | ''Auch Reiche sind nur Menschen''
+        # | 4. Mai 2009
+        # | 17. Mai 2011
+        # | 6. Juni 2011 (Teil 1)<br />13. Juni 2011 (Teil 2)
+        # |-
+        # |}
+        #
+        def self.parse_inarticle_season_table(table)
+            raise ArgumentError, 'String with seasontable expected' unless
+                table.is_a?(String)
+            season_data = []
+            episode_nr_line_nr   = nil
+            episode_name_line_nr = nil
+            table.split(/^\|\-.*$/).each do |tablerow|
+                tablerow.strip!
+                # skip invalid rows
+                lines = tablerow.lines.to_a
+                next unless lines.length >= 4
+                if tablerow.match(/width=\"\d+\"/)
+                    # extract line numbers for needed data that
+                    # are in the table header
+                    lines.each_with_index do |item, index|
+                        if item.match(/Nummer.*Staffel/i)
+                            episode_nr_line_nr = index
+                        # TODO make the following more variable
+                        elsif item.match(/Deutscher.*Titel/i)
+                            episode_name_line_nr = index
+                        end
+                    end
+                else
+                    # extract episode information
+                    if episode_nr_line_nr && episode_name_line_nr
+                        md_nr = lines[episode_nr_line_nr].strip.match(/(\d+)/)
+                        if md_nr
+                            episode_nr = md_nr[1].to_i
+                            md_name = lines[episode_name_line_nr].strip.match(/^\|.(.*)$/)
+                            if md_name
+                                episode_name = md_name[1]
+                                episode_name.gsub!(/[\'\"\[\]]/, "")
+                                next unless episode_name.match(/\w+/)
+                                season_data[episode_nr] = episode_name.strip
+                            end
+                        end
+                    end
+                end
+            end
+            return season_data
+        end
+        # this method checks if the page is the main page
+        # for a series
+        #
+        # returns true if page contains the infobox that
+        # is typical for series pages in wikipedia
+        def self.is_series_main_page?(page)
+            page.match(@@SERIES_SITE_TEST_PATTERN) != nil
+        end
+        # check the site if it is a disambiguation site
+        #
+        # returns true if this site links to pages with
+        # themes with the same name
+        def self.is_disambiguation_site?(page)
+            page.match(@@DISAMBIGUATION_TEST_PATTERN) != nil
+        end
+        # test if the page contains a link to an article
+        # with an episode list
+        def self.contains_link_to_episode_list?(page)
+            page.match(@@CONTAINS_LINK_TO_EPISODE_LIST) != nil
+        end
+        # test if the page contains a episode list
+        def self.contains_inarticle_episode_list?(page)
+            page.match(@@CONTAINS_INARTICLE_EPISODE_LIST) != nil
+        end
+    end
+end

data/lib/plugin.rb ADDED Viewed

@@ -0,0 +1,8 @@
+$:.unshift(File.dirname(__FILE__)) unless
+  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
+module Plugin
+    Dir[File.dirname(__FILE__) + '/plugin/*.rb'].each {|file| require file }
+end

data/lib/serienrenamer/episode.rb ADDED Viewed

@@ -0,0 +1,313 @@
+# coding: UTF-8
+require 'find'
+require 'fileutils'
+require 'wlapi'
+module Serienrenamer
+    class Episode
+        attr_reader :season, :episode, :episodename,
+            :extension, :episodepath, :success, :source_directory
+        attr_accessor :episodename_needed, :series
+        # patterns for suitable episodes
+        @@PATTERNS = [
+            # S01E01
+            /^(?<series>.*)S(?<season>\d+)E(?<episode>\d+)(?<episodename>.*)$/i,
+            # 101; 1212
+            /^(?<series>.*\D)(?<season>\d+)(?<episode>\d{2})(?<episodename>\W*.*)$/,
+            # 1x1; 12x12
+            /^(?<series>.*)(?<season>\d+)x(?<episode>\d+)(?<episodename>.*)$/,
+        ]
+        # allowed endings for episode files
+        @@ENDINGS = %w( mpg mpeg avi mkv wmv mp4 mov flv 3gp )
+        # trash words that are removed from the episodename
+        @@TRASH_WORDS = %w(
+            German Dubbed DVDRip HDTVRip XviD ITG TVR inspired HDRip
+            AMBiTiOUS RSG SiGHT SATRip WS TVS RiP READ GERMAN dTV aTV
+            iNTERNAL CRoW MSE c0nFuSed UTOPiA scum EXPiRED BDRiP HDTV
+            iTunesHD 720p x264 h264 CRiSP euHD WEBRiP ZZGtv ARCHiV DD20
+            Prim3time Nfo Repack SiMPTY BLURAYRiP BluRay DELiCiOUS Synced
+            UNDELiCiOUS fBi CiD iTunesHDRip RedSeven OiNK idTV DL DD51
+        )
+        # Constructor for the Episode-Class, which takes an episode as
+        # argument and extracts as much as information from the file
+        # that it can.
+        def initialize(episodepath, episodename_needed=true)
+            raise ArgumentError, 'no episodepath provided' unless episodepath
+            # make some checks on the given episode path
+            unless File.exists?(episodepath) || Dir.exists?(episodepath)
+                raise ArgumentError, "episodepath not existing"
+            end
+            unless Episode.determine_video_file(episodepath)
+                raise ArgumentError, 'no videofile found'
+            end
+            @source_directory = nil
+            # normalize information for dirs/files
+            basepath = File.basename(episodepath)
+            if File.file?(episodepath)
+                basepath = basepath.chomp(File.extname(basepath))
+            elsif File.directory?(episodepath)
+                @source_directory = episodepath
+                # if directory does not contain episode information
+                # check for an text file with suitable information
+                unless Episode.contains_episode_information?(basepath)
+                    info = Plugin::Textfile.generate_episode_information(episodepath)[0]
+                    basepath = info if info
+                end
+            end
+            unless Episode.contains_episode_information?(basepath)
+                raise ArgumentError, 'no episode information existing'
+            end
+            @episodepath = Episode.determine_video_file(episodepath)
+            infos = Episode.extract_episode_information(basepath)
+            raise ArgumentError, 'no suitable regex pattern matches' unless infos
+            @series = Episode.clean_episode_data(infos[:series]).strip
+            @episodename = Episode.clean_episode_data(
+                infos[:episodename], true, true).strip
+            @season = infos[:season].to_i
+            @episode = infos[:episode].to_i
+            # setting up special behaviour
+            @episodename_needed=episodename_needed
+            @extension=File.extname(@episodepath).gsub('.','')
+            @success=false
+        end
+        # Returns the episode information into a format like
+        # S0xE0x, depending on @episodename_needed it includes
+        # the episodename
+        def to_s
+            if @episodename_needed
+                return "S%.2dE%.2d - %s.%s" % [ @season, @episode, @episodename, @extension ]
+            else
+                return "S%.2dE%.2d.%s" % [ @season, @episode, @extension ]
+            end
+        end
+        # this method makes it possible to set the episodename
+        # afterwards
+        #
+        # options:
+        #   :data
+        #           string that contains epissodename information
+        #   :need_cleanup
+        #           if true than it will apply the standard regex
+        #           to clean the string and extracts that with
+        #           the standard patterns
+        #           if false the string will applied without any
+        #           checks or cleanup
+        #   :extract_seriesname
+        #           tries to extract the seriesname from data
+        def add_episode_information(data, need_cleanup=true, extract_seriesname=false)
+            return unless data
+            if need_cleanup
+                if Episode.contains_episode_information?(data)
+                    infos = Episode.extract_episode_information(data)
+                    if infos
+                        data = infos[:episodename]
+                        # try to extract seriesname if needed
+                        if extract_seriesname and infos[:series].match(/\w+/)
+                            seriesname = Episode.clean_episode_data(infos[:series])
+                            @series = seriesname.strip
+                        end
+                    end
+                end
+                data = Episode.clean_episode_data(data, true, true).strip
+            end
+            @episodename = data
+        end
+        # renames the given episodefile into the new
+        # clean format and sets the status on success
+        #
+        def rename(destination_dir=".")
+            raise IOError, 'episode file not existing' unless File.file?(@episodepath)
+            destination_file = File.join(destination_dir, self.to_s)
+            begin
+                File.rename(@episodepath, destination_file)
+                if @source_directory
+                    FileUtils.remove_dir(@source_directory)
+                end
+                @success = true
+            rescue SystemCallError => e
+                puts "Rename failed: #{e}"
+            end
+        end
+        ##################
+        # static methods #
+        ##################
+        # cleans strings from things that can occur in
+        # episode files like dots (.) and trash words
+        #
+        # parameter:
+        #   :data
+        #       string that will be cleaned
+        #   :include_trashwords
+        #       remove Words like German or Dubbed from
+        #       the string (Trashwords)
+        #   :repair_umlauts
+        #       try to repair broken umlauts if they occur
+        #
+        def self.clean_episode_data(data, include_trashwords=false, repair_umlauts=false)
+            data.gsub!(/\./, " ")
+            data.gsub!(/\_/, " ")
+            data.gsub!(/\-/, " ")
+            # if this feature is enabled than all trash words
+            # are removed from the string. If two trashwords
+            # occur than all trailing words will be removed.
+            # if a word is removed and the next is not a trash
+            # word than the removed word will be included
+            if include_trashwords
+                purge_count= 0
+                last_purge = nil
+                cleanwords = []
+                for word in data.split(/ /) do
+                    next unless word.match(/\w+/)
+                    word = repair_umlauts(word) if repair_umlauts
+                    # if word is in TRASH_WORDS
+                    if ! @@TRASH_WORDS.grep(/^#{word}$/i).empty?
+                        purge_count += 1
+                        last_purge = word
+                        break if purge_count == 2;
+                    else
+                        if purge_count == 1 && last_purge != nil
+                            cleanwords.push(last_purge)
+                            purge_count = 0
+                        end
+                        cleanwords.push(word)
+                    end
+                end
+                data = cleanwords.join(" ")
+            end
+            return data
+        end
+        # This method tries to repair some german umlauts so that
+        # the following occurs
+        #
+        # ae => ä ; ue => ü ; oe => ö ; Ae => Ä ; Ue => Ü ; Oe => Ö
+        #
+        # This method uses a webservice at:
+        #   http://wortschatz.uni-leipzig.de/
+        # which produces statistics about the german language and
+        # e.g. frequency of words occuring in the german language
+        #
+        # this method convert all broken umlauts in the word and compares
+        # the frequency of both version and uses the version which is more
+        # common
+        #
+        # returns an repaired version of the word if necessary
+        def self.repair_umlauts(word)
+            if contains_eventual_broken_umlauts?(word)
+                repaired = word.gsub(/ae/, 'ä').gsub(/ue/, 'ü').gsub(/oe/, 'ö')
+                repaired.gsub!(/^Ae/, 'Ä')
+                repaired.gsub!(/^Ue/, 'Ü')
+                repaired.gsub!(/^Oe/, 'Ö')
+                ws = WLAPI::API.new
+                res_broken  = ws.frequencies(word)
+                freq_broken = res_broken.nil? ? -1 : res_broken[0].to_i
+                res_repaired  = ws.frequencies(repaired)
+                freq_repaired = res_repaired.nil? ? -1 : res_repaired[0].to_i
+                if freq_repaired > freq_broken
+                    return repaired
+                end
+            end
+            return word
+        end
+        # checks for eventual broken umlauts
+        #
+        # returns true if broken umlaut if included
+        def self.contains_eventual_broken_umlauts?(string)
+            ! string.match(/ae|ue|oe|Ae|Ue|Oe/).nil?
+        end
+        # tries to match the given string against
+        # all supported regex-patterns and returns true if a
+        # suitable regex is found
+        def self.contains_episode_information?(info)
+            @@PATTERNS.each do |p|
+                if info.match(p)
+                    return true
+                end
+            end
+            return false
+        end
+        # tries to find a suitable pattern and returns
+        # the matched data or nil if nothing matched
+        def self.extract_episode_information(info)
+            pattern = @@PATTERNS.select { |p| ! info.match(p).nil? }[0]
+            if pattern
+                return pattern.match(info)
+            end
+            return nil
+        end
+        # tries to find a valid video file in a given path.
+        #
+        # If path is a file it returns path unchanged if file
+        # is a valid video file or nil unless
+        #
+        # If path is a dir it searches for the biggest valid
+        # videofile in it and returns the path or nil if nothing
+        # found
+        def self.determine_video_file(path)
+            if File.file?(path)
+                matched_endings = @@ENDINGS.select { |e| ! path.match(/#{e}$/).nil? }
+                return path if ! matched_endings.empty?
+            elsif File.directory?(path)
+                videofile = nil
+                for file in Find.find(path) do
+                    matched_endings = @@ENDINGS.select { |e| ! file.match(/#{e}$/).nil? }
+                    if ! matched_endings.empty?
+                        if videofile == nil || File.size(file) > File.size(videofile)
+                            videofile = file
+                        end
+                    end
+                end
+                return videofile if videofile
+            end
+            return nil
+        end
+    end
+end