RubyGems - serienrenamer - Versions diffs - 0.0.14 → 0.0.15 - Mend

serienrenamer 0.0.14 → 0.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

data/bin/serienrenamer +0 -6
data/lib/serienrenamer/plugin/episode_identifier.rb +35 -0
data/lib/serienrenamer/plugin/serienjunkies_de.rb +131 -0
data/lib/serienrenamer/plugin/serienjunkies_feed.rb +112 -0
data/lib/serienrenamer/plugin/serienjunkies_org.rb +181 -0
data/lib/serienrenamer/plugin/textfile.rb +57 -0
data/lib/serienrenamer/plugin/wikipedia.rb +448 -0
data/lib/serienrenamer/plugin.rb +26 -0
data/lib/serienrenamer/version.rb +1 -1
data/lib/serienrenamer.rb +4 -25
data/serienrenamer.gemspec +1 -0
data/test/test_helper.rb +1 -1
data/test/test_plugin_episode_identifier.rb +1 -1
data/test/test_plugin_serienjunkies_de.rb +3 -3
data/test/test_plugin_serienjunkies_feed.rb +1 -1
data/test/{test_serienjunkies_org.rb → test_plugin_serienjunkies_org.rb} +3 -3
data/test/test_plugin_textfile.rb +3 -3
metadata +12 -12
data/lib/plugin/episode_identifier.rb +0 -33
data/lib/plugin/serienjunkies_de.rb +0 -129
data/lib/plugin/serienjunkies_feed.rb +0 -110
data/lib/plugin/serienjunkies_org.rb +0 -179
data/lib/plugin/textfile.rb +0 -54
data/lib/plugin/wikipedia.rb +0 -446
data/lib/plugin.rb +0 -8

data/lib/plugin/wikipedia.rb DELETED Viewed

@@ -1,446 +0,0 @@
-# encoding: UTF-8
-require 'media_wiki'
-module Plugin
-    # This plugin tries to extract series
-    # information from Wikipedia.
-    #
-    # (For now, only the German Wikipedia is supported.)
-    class Wikipedia < Serienrenamer::Pluginbase
-        def self.plugin_name; "Wikipedia" end # name string returned for this plugin
-        def self.usable; true end # this plugin always reports itself as usable
-        def self.priority; 30 end # NOTE(review): how callers order or compare priority values is not visible here
-        @@WIKIPEDIA_URL = 'http://de.wikipedia.org/w/api.php'
-        # patterns used in this class
-        @@EPISODE_TABLE_PATTERN = /.*(?<table>\{\{Episodenlistentabelle.*\}\})\s*$/m
-        @@EPISODE_ENTRY_PATTERN = /\{\{Episodenlisteneintrag|S-Episode/
-        @@SERIES_SITE_TEST_PATTERN = /\{\{Infobox.Fernsehsendung.*\}\}/m
-        @@DISAMBIGUATION_TEST_PATTERN = /\{\{Begriffsklärung\}\}/m
-        @@CONTAINS_LINK_TO_EPISODE_LIST = /Hauptartikel.*(?<main>Liste.*?)[\]\}]+/
-        @@CONTAINS_INARTICLE_EPISODE_LIST = /\<div.*\>Staffel.(\d+).*\<\/div\>.*class=\"wikitable\".*titel/m
-        @@INPAGE_SEASON_SEPARATOR = /\<div.style=\"clear:both\;.class=\"NavFrame\"\>/
-        @@WIKITABLE_EXTRACT_PATTERN = /(\{\|.class=\"wikitable\".*\|\})\n/m
-        @@IS_ONE_LINE_EPISODE_LIST = /\|.*\|\|.*\|\|.*\|\|/m
-        # Entry point: called from the main program with a
-        # Serienrenamer::Episode instance (anything else raises ArgumentError).
-        # Parsed series data is cached per series name in @cached_data.
-        # Returns an array with the episode name, or [] when nothing was found.
-        def self.generate_episode_information(episode)
-            raise ArgumentError, "Serienrenamer::Episode instance needed" unless
-                episode.is_a?(Serienrenamer::Episode)
-            return [] unless episode.series.match(/\w+/) # series name has no word character to search for
-            unless defined? @cached_data
-                @cached_data = Hash.new
-            end
-            wiki = MediaWiki::Gateway.new(@@WIKIPEDIA_URL)
-            if ! @cached_data.has_key?(episode.series)
-                # search Wikipedia for the main page of the series
-                series_site = nil
-                tries = 3 # attempts left; decremented on each MediaWiki::APIError
-                search_pattern = episode.series
-                search_pattern_modified = false
-                begin
-                    wiki.search(search_pattern, nil, 15).each do |title| # NOTE(review): 15 is presumably a result limit - confirm against the media_wiki gem
-                        pagedata = wiki.get(title)
-                        if is_series_main_page?(pagedata)
-                            series_site = title
-                            break
-                        end
-                    end
-                    # fallback when no series page was found: search once more
-                    # with only the last word of the search term
-                    # NOTE(review): match returns nil (so [1] raises NoMethodError) if the pattern does not match
-                    if series_site.nil? && ! search_pattern_modified
-                        search_pattern = search_pattern.match(/(\w+)\s*$/)[1]
-                        search_pattern_modified = true
-                        raise EOFError if search_pattern # EOFError only serves as a signal to reach the retry below
-                    end
-                rescue MediaWiki::APIError => e
-                    tries -= 1
-                    retry if tries > 0 # once the attempts are used up, execution continues after the begin block
-                rescue EOFError => e
-                    retry # rerun the search with the shortened term; the modified flag prevents a second fallback
-                end
-                return [] unless series_site
-                # prefer a linked episode-list article; otherwise parse a list embedded in the series page
-                pagedata = wiki.get(series_site)
-                if contains_link_to_episode_list?(pagedata)
-                    mainarticle = pagedata.match(@@CONTAINS_LINK_TO_EPISODE_LIST)[:main]
-                    if mainarticle
-                        episodelist_page = wiki.get(mainarticle)
-                        series = parse_episodelist_page_data(episodelist_page)
-                        @cached_data[episode.series] = series
-                    end
-                elsif contains_inarticle_episode_list?(pagedata)
-                    series = parse_inarticle_episodelist_page_data(pagedata)
-                    @cached_data[episode.series] = series
-                else
-                    warn "no episode list found"
-                    return []
-                end
-            end
-            episode_names = []
-            # look up the name as series[season][episode] in the cached data;
-            # any StandardError raised here (e.g. a missing season) is swallowed and [] is returned
-            begin
-                series = @cached_data[episode.series]
-                episodename = series[episode.season][episode.episode]
-                if episodename.match(/\w+/)
-                    episode_names.push(episodename)
-                end
-            rescue
-            end
-            return episode_names
-        end
-        # Extracts season-based episode information from a
-        # string that contains a Wikipedia episode-list page.
-        # Raises ArgumentError unless pagedata is a String; debug is not used in this method.
-        # Returns an Array of Arrays with episode information
-        # where season and episode numbers are the indices.
-        def self.parse_episodelist_page_data(pagedata, debug=false)
-            raise ArgumentError, 'String with pagedata expected' unless
-                pagedata.is_a?(String)
-            series_data = []
-            is_season_table_following = false
-            season_number = nil
-            # split the page at "== ... ==" headings; because of the capture
-            # group the heading text is kept as its own element, so a 'Staffel <n>'
-            # heading is seen first and the element after it is parsed as that season
-            pagedata.split(/(==.*)==/).each do |paragraph|
-                if paragraph.match(/^==.*Staffel/)
-                    match = paragraph.match(/^==.*Staffel.(?<seasonnr>\d+)/)
-                    if match
-                        season_number = match[:seasonnr].to_i
-                        is_season_table_following = true
-                    end
-                elsif is_season_table_following
-                    # this element directly follows a season heading:
-                    # extract the season table from it
-                    season = parse_season_table(paragraph)
-                    series_data[season_number] = season
-                    is_season_table_following = false
-                end
-            end
-            return series_data
-        end
-        # Parses one Wikipedia season table (a String, else ArgumentError)
-        # and extracts all episode entries from it.
-        # Returns an array of episode names where the
-        # episode number (the NR_ST value of each entry) is the index.
-        def self.parse_season_table(table)
-            raise ArgumentError, 'String with seasontable expected' unless
-                table.is_a?(String)
-            season_data = []
-            matched_table = table.match(@@EPISODE_TABLE_PATTERN)
-            if matched_table
-                # split the table into its episode entries,
-                # each of which looks like the following:
-                #
-                # {{Episodenlisteneintrag
-                # | NR_GES = 107
-                # | NR_ST = 1
-                # | OT = The Mastodon in the Room
-                # | DT = Die Rückkehr der Scheuklappen
-                # | ZF =
-                # | EA = {{dts|23|09|2010}}
-                # | EAD = {{dts|08|09|2011}}
-                # }}
-                episodes = matched_table[:table].split(@@EPISODE_ENTRY_PATTERN)
-                if episodes
-                    episodes.each do |epi|
-                        # collect the "KEY = value" lines of the entry in a hash
-                        infos = {}
-                        epi.lines.each do |part|
-                            parts = part.strip.match(/(?<key>\w+).=.(?<value>.*)$/)
-                            if parts
-                                infos[parts[:key].strip] = parts[:value].strip
-                            end
-                        end
-                        next unless infos.has_key?('NR_ST') # skip chunks that carry no NR_ST line
-                        # the NR_ST value becomes the index
-                        # into the result array
-                        epi_nr = infos['NR_ST'].to_i
-                        next unless epi_nr # NOTE(review): to_i never returns nil, so this guard never skips
-                        # TODO: make the title key configurable (DT is hard-coded)
-                        epi_name = infos['DT'].strip # NOTE(review): raises NoMethodError when no DT value was parsed for the entry
-                        # cut everything from the first HTML tag to the end
-                        # of the line, then remove all square brackets
-                        # (e.g. from MediaWiki [[...]] markup)
-                        epi_name.gsub!(/<\/?[^>]*>.*/, "")
-                        epi_name.gsub!(/[\[\[\]\]]/, "")
-                        next unless epi_name.match(/\w+/)
-                        season_data[epi_nr] = epi_name
-                    end
-                end
-            end
-            return season_data
-        end
-        # Extracts season-based episode information from a
-        # string that contains a series page with an embedded
-        # episode list. Raises ArgumentError unless pagedata is a String or when
-        # no paragraph with an episode list is found; debug is not used in this method.
-        # Returns an Array of Arrays with episode information
-        # where season and episode numbers are the indices.
-        def self.parse_inarticle_episodelist_page_data(pagedata, debug=false)
-            raise ArgumentError, 'String with pagedata expected' unless
-                pagedata.is_a?(String)
-            series_data = []
-            # take the first "== ... =="-delimited paragraph that contains an episode list
-            episodelist_paragraph = pagedata.split(/==.*==/).select { |p|
-                contains_inarticle_episode_list?(p) }[0]
-            raise ArgumentError, 'no episodelist found' unless episodelist_paragraph
-            # each chunk between season separators is treated as one season
-            episodelist_paragraph.split(@@INPAGE_SEASON_SEPARATOR).each do |season|
-                next unless contains_inarticle_episode_list?(season)
-                season_nr = season.match(@@CONTAINS_INARTICLE_EPISODE_LIST)[1].to_i
-                wikitable = season.match(@@WIKITABLE_EXTRACT_PATTERN)[1] # NOTE(review): raises NoMethodError if the wikitable pattern does not match
-                # detect the layout of the in-article season table, because
-                # two different kinds of table structure are in use
-                # in the German Wikipedia
-                if self.is_episode_list_with_one_episode_per_line?(wikitable)
-                    episodes = parse_inarticle_season_table_with_one_line(wikitable)
-                else
-                    episodes = parse_inarticle_season_table(wikitable)
-                end
-                # HACK: a season may be split into several parts (e.g. Flashpoint
-                # 2.1 and 2.2); entries of the part stored earlier fill the gaps of this one
-                if series_data[season_nr] != nil
-                    series_data[season_nr].each_with_index do |item, index|
-                        episodes[index] = item unless episodes[index]
-                    end
-                end
-                series_data[season_nr] = episodes
-            end
-            return series_data
-        end
-        # Parses a wikitable for one season (a String, else ArgumentError)
-        # in which every table cell is written on its own line.
-        # Returns an array of episode names where the episode number is
-        # the index.
-        #
-        # Example of a wikitable for episodes:
-        #
-        # {| class="wikitable" width="100%"
-        # |- vertical-align: top; text-align:center; "
-        # | width="15" | '''Nummer''' <br /><small>(Gesamt)<small>
-        # | width="15" | '''Nummer''' <br /><small>(Staffel)<small>
-        # ! width="250" | Originaltitel
-        # ! width="250" | Deutscher Titel
-        # ! width="180" | Erstausstrahlung<br /><small>(USA Network)</small>
-        # ! width="180" | Erstausstrahlung<br /><small>(RTL)</small>
-        # ! width="180" | Erstausstrahlung<br /><small>(SF zwei)</small>
-        # |-
-        # | bgcolor="#DFEEEF"| 01
-        # | 01
-        # | ''Pilot''
-        # | ''Auch Reiche sind nur Menschen''
-        # | 4. Mai 2009
-        # | 17. Mai 2011
-        # | 6. Juni 2011 (Teil 1)<br />13. Juni 2011 (Teil 2)
-        # |-
-        # |}
-        #
-        def self.parse_inarticle_season_table(table)
-            raise ArgumentError, 'String with seasontable expected' unless
-                table.is_a?(String)
-            season_data = []
-            episode_nr_line_nr   = nil
-            episode_name_line_nr = nil
-            table.split(/^\|\-.*$/).each do |tablerow| # rows are separated by lines starting with "|-"
-                tablerow.strip!
-                # rows with fewer than four lines are skipped
-                lines = tablerow.lines.to_a
-                next unless lines.length >= 4
-                if tablerow.match(/width=\"\d+\"/)
-                    # header row (recognised by its width attributes): remember at
-                    # which line offsets the episode number and the title are found
-                    lines.each_with_index do |item, index|
-                        if item.match(/Nummer.*Staffel/i)
-                            episode_nr_line_nr = index
-                        # TODO: make the title column configurable
-                        elsif item.match(/Deutscher.*Titel/i)
-                            episode_name_line_nr = index
-                        end
-                    end
-                else
-                    # data row: only usable once both header offsets are known
-                    if episode_nr_line_nr && episode_name_line_nr
-                        md_nr = lines[episode_nr_line_nr].strip.match(/(\d+)/) # first run of digits on the number line
-                        if md_nr
-                            episode_nr = md_nr[1].to_i
-                            md_name = lines[episode_name_line_nr].strip.match(/^\|.(.*)$/) # text after the leading "|" and one more character
-                            if md_name
-                                episode_name = md_name[1]
-                                episode_name.gsub!(/[\'\"\[\]]/, "") # drop quote characters and square brackets
-                                next unless episode_name.match(/\w+/)
-                                season_data[episode_nr] = episode_name.strip
-                            end
-                        end
-                    end
-                end
-            end
-            return season_data
-        end
-        # Parses a wikitable for one season (a String, else ArgumentError)
-        # and extracts all episodes from it.
-        # Returns an array of episode names where the episode number is
-        # the index.
-        #
-        # This method handles a special format, used e.g. for
-        # 'Prison Break', where an episode is written on a single line instead of
-        # being spread over several lines as in parse_inarticle_season_table.
-        #
-        # Example of a wikitable for episodes:
-        #
-        # {| class="wikitable"
-        # |- style="color:#black; background-color:#006699"
-        # ! '''Episode''' !! '''Deutscher Titel''' !! '''Originaltitel''' !! '''Erstausstrahlung (DE)''' !! '''Erstausstrahlung (USA)'''
-        # |-
-        # |'''1''' (1-01) || Der große Plan || Pilot || 21. Juni 2007 || 29. August 2005
-        # |-
-        # |'''2''' (1-02) || Lügt Lincoln? || Allen || 21. Juni 2007 || 29. August 2005
-        # |-
-        # |'''3''' (1-03) || Vertrauenstest || Cell Test || 28. Juni 2007 || 5. September 2005
-        # |-
-        # |'''4''' (1-04) || Veronica steigt ein || Cute Poison || 28. Juni 2007 || 12. September 2005
-        #
-        def self.parse_inarticle_season_table_with_one_line(table)
-            raise ArgumentError, 'String with seasontable expected' unless
-                table.is_a?(String)
-            season_data = []
-            episode_nr_col   = nil
-            episode_name_col = nil
-            table.split(/^\|\-.*$/).each do |tablerow|
-                if tablerow.match(/!!.*!!.*!!/)
-                    # header row: remember the column indices of number and title
-                    tablerow.split(/!!/).each_with_index do |col,index|
-                        episode_nr_col   = index if col.match(/Episode/i)
-                        episode_name_col = index if col.match(/Deutsch.*Titel/i)
-                    end
-                elsif tablerow.match(/\|\|.*\w+.*\|\|/)
-                    tablerow.strip!
-                    columns = tablerow.split(/\|\|/)
-                    # replace every non-digit with a space, so that
-                    # " '''7''' (1-07) " => "7     1 07"
-                    #
-                    # the last run of digits is then taken as the episode number,
-                    # which keeps this fairly independent of the exact cell format
-                    dirty_episode_nr   = columns[episode_nr_col].gsub(/\D/, " ").strip # NOTE(review): episode_nr_col is still nil if no header row was seen before
-                    episode_nr = dirty_episode_nr.match(/(\d+)$/)[1] # NOTE(review): raises NoMethodError when the cell contains no digits
-                    next unless episode_nr
-                    episode_name = columns[episode_name_col].strip
-                    next unless episode_nr.match(/\w+/) # NOTE(review): tests episode_nr; presumably episode_name was intended - confirm
-                    season_data[episode_nr.to_i] = episode_name
-                end
-            end
-            return season_data
-        end
-        # Checks whether the given page text is the main page
-        # of a series.
-        #
-        # Returns true if the page matches @@SERIES_SITE_TEST_PATTERN, i.e. it
-        # contains the infobox that is typical for series pages in Wikipedia.
-        def self.is_series_main_page?(page)
-            page.match(@@SERIES_SITE_TEST_PATTERN) != nil
-        end
-        # Checks whether the given page text is a disambiguation page.
-        #
-        # Returns true if the page matches @@DISAMBIGUATION_TEST_PATTERN, i.e. it
-        # links to pages on different topics that share the same name.
-        def self.is_disambiguation_site?(page)
-            page.match(@@DISAMBIGUATION_TEST_PATTERN) != nil
-        end
-        # Returns true if the page text matches @@CONTAINS_LINK_TO_EPISODE_LIST,
-        # i.e. it references a separate article with an episode list.
-        def self.contains_link_to_episode_list?(page)
-            page.match(@@CONTAINS_LINK_TO_EPISODE_LIST) != nil
-        end
-        # Returns true if the page text itself contains an episode list (matches @@CONTAINS_INARTICLE_EPISODE_LIST).
-        def self.contains_inarticle_episode_list?(page)
-            page.match(@@CONTAINS_INARTICLE_EPISODE_LIST) != nil
-        end
-        # Returns true if the table text matches @@IS_ONE_LINE_EPISODE_LIST, the layout with one episode per line.
-        def self.is_episode_list_with_one_episode_per_line?(page)
-            page.match(@@IS_ONE_LINE_EPISODE_LIST) != nil
-        end
-    end
-end

data/lib/plugin.rb DELETED Viewed

@@ -1,8 +0,0 @@
-$:.unshift(File.dirname(__FILE__)) unless
-  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__))) # prepend this file's directory to the load path ($:) unless it is already listed
-module Plugin
-    Dir[File.dirname(__FILE__) + '/plugin/*.rb'].each {|file| require file } # require every .rb file in the plugin/ directory next to this file
-end