RubyGems - serienrenamer - Versions diffs - 0.1.2 → 0.2.0 - Mend

serienrenamer 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

checksums.yaml +4 -4
data/.gitignore +0 -1
data/.travis.yml +2 -1
data/Gemfile +1 -1
data/Gemfile.lock +57 -0
data/README.rdoc +19 -23
data/bin/serienrenamer +7 -1
data/lib/serienrenamer/episode.rb +3 -59
data/lib/serienrenamer/plugin.rb +1 -0
data/lib/serienrenamer/plugin/umlauts.rb +70 -0
data/lib/serienrenamer/version.rb +1 -1
data/serienrenamer.gemspec +1 -3
data/test/test_episode.rb +64 -118
data/test/test_information_store.rb +16 -19
data/test/test_plugin_serienjunkies_feed.rb +36 -38
data/test/test_plugin_textfile.rb +11 -17
data/test/test_plugin_umlauts.rb +40 -0
metadata +18 -51
data/lib/serienrenamer/plugin/serienjunkies_de.rb +0 -131
data/lib/serienrenamer/plugin/serienjunkies_org.rb +0 -181
data/lib/serienrenamer/plugin/wikipedia.rb +0 -448
data/test/skipped_test_plugin_serienjunkies_org.rb +0 -84
data/test/skipped_test_plugin_wikipedia.rb +0 -177
data/test/test_plugin_serienjunkies_de.rb +0 -98

data/lib/serienrenamer/plugin/serienjunkies_org.rb DELETED Viewed

@@ -1,181 +0,0 @@
-#
-# Class that extracts information about episodes
-# from the serienjunkies.org-Page
-#
-require 'uri'
-require 'mechanize'
-require 'yaml'
-module Serienrenamer
-  module Plugin
-    class SerienjunkiesOrg < Serienrenamer::Pluginbase
-      def self.plugin_name; "SerienjunkiesOrg" end
-      def self.plugin_url; "http://serienjunkies.org" end
-      def self.usable; true end
-      def self.priority; 60 end
-      # Public: tries to search for an appropriate episodename
-      #
-      # if this is the first call to this method, it builds up
-      # a hash with all series and existing episodes, which can
-      # be used by all future method calls
-      #
-      # episode - Serienrenamer::Episode instance which holds the information
-      #
-      # Returns an array of possible episodenames
-      def self.generate_episode_information(episode)
-        raise ArgumentError, "Serienrenamer::Episode instance needed" unless
-        episode.is_a?(Serienrenamer::Episode)
-        unless defined? @cached_data
-          @cached_data = Hash.new
-        end
-        if ! @cached_data.has_key?(episode.series)
-          if episode.series.match(/\w+/)
-            # determine link to series
-            seriespage_link = self.find_link_to_series_page(episode.series)
-            if seriespage_link
-              seriesdata = self.parse_seriespage(seriespage_link)
-              @cached_data[episode.series] = seriesdata
-            end
-          end
-        end
-        matched_episodes = []
-        # tries to find an episodename in cached_data
-        # otherwise returns empty array
-        begin
-          series = @cached_data[episode.series]
-          identifier = "%d_%d" % [ episode.season, episode.episode ]
-          episodename = series[identifier]
-          if episodename.match(/\w+/)
-            matched_episodes.push(episodename)
-          end
-        rescue
-        end
-        return matched_episodes
-      end
-      # Public: tries to find a link to the seriespage
-      #
-      # seriesname  - the series name for which the page is searched
-      #
-      # Returns the link or nil
-      def self.find_link_to_series_page(seriesname)
-        raise ArgumentError, "seriesname expected" unless seriesname.match(/\w+/)
-        self.build_agent unless defined? @agent
-        url = URI.join(plugin_url, "?cat=0&l=%s" % seriesname[0].downcase )
-        pattern = seriesname.gsub(/\s/, ".*")
-        @agent.get(url).search("div#sidebar > ul > li > a").each do |series|
-          if series.text.match(/#{pattern}/i)
-            return URI.join( plugin_url, series[:href]).to_s
-          end
-        end
-        nil
-      end
-      # Public: parses a series page and extracts the episode information
-      #
-      # page_url   - the url to the seriespage
-      # german     - if true it extracts only german data (Defaults to true)
-      #
-      # Returns a hash which contains the episode information or an empty
-      # hash if there aren't any episodes
-      def self.parse_seriespage(page_url, german=true, debug=false)
-        self.build_agent unless defined? @agent
-        series = {}
-        doc = @agent.get(page_url)
-        doc.search('div#sidebar > div#scb > div.bkname > a').each do |link|
-          if german
-            next unless link.content.match(/Staffel/i)
-          else
-            next unless link.content.match(/Season/i)
-          end
-          site = @agent.get(link[:href])
-          episodes = self.parse_season_subpage(site, german)
-          series.merge!(episodes)
-        end
-        puts series.to_yaml if debug
-        return series
-      end
-      # Public: extracts the episodes from one season
-      #
-      # page   - Mechanize page object which holds the season
-      # german - extracts german or international episodes
-      #
-      # Returns a hash with all episodes (unique)
-      def self.parse_season_subpage(page, german=true)
-        episodes = {}
-        page.search('div.post > div.post-content strong:nth-child(1)').each do |e|
-          content =  e.content
-          md = Serienrenamer::Episode.extract_episode_information(content)
-          next unless md
-          if german
-            next unless content.match(/German/i)
-            next if content.match(/Subbed/i)
-          else
-            next if content.match(/German/i)
-          end
-          episodename =
-            Serienrenamer::Episode.clean_episode_data(md[:episodename], true)
-          next unless episodename && episodename.match(/\w+/)
-          id = "%d_%d" % [ md[:season].to_i, md[:episode].to_i ]
-          next if episodes[id] && episodes[id].size > episodename.size
-          episodes[id] = episodename
-        end
-        return episodes
-      end
-      private
-      # Private: constructs a Mechanize instance and adds a fix that interprets
-      #          every response as html
-      #
-      # Returns the agent
-      def self.build_agent
-        @agent = Mechanize.new do |a|
-          a.post_connect_hooks << lambda do |_,_,response,_|
-            if response.content_type.nil? || response.content_type.empty?
-              response.content_type = 'text/html'
-            end
-          end
-        end
-      end
-    end
-  end
-end

data/lib/serienrenamer/plugin/wikipedia.rb DELETED Viewed

@@ -1,448 +0,0 @@
-# encoding: UTF-8
-require 'media_wiki'
-module Serienrenamer
-  module Plugin
-    # This Plugin tries to extract the series
-    # information from wikipedia
-    #
-    # (by now only the german wikipedia)
-    class Wikipedia < Serienrenamer::Pluginbase
-      def self.plugin_name; "Wikipedia" end
-      def self.usable; true end
-      def self.priority; 30 end
-      @@WIKIPEDIA_URL = 'http://de.wikipedia.org/w/api.php'
-      # patterns used in this class
-      @@EPISODE_TABLE_PATTERN = /.*(?<table>\{\{Episodenlistentabelle.*\}\})\s*$/m
-      @@EPISODE_ENTRY_PATTERN = /\{\{Episodenlisteneintrag|S-Episode/
-      @@SERIES_SITE_TEST_PATTERN = /\{\{Infobox.Fernsehsendung.*\}\}/m
-      @@DISAMBIGUATION_TEST_PATTERN = /\{\{Begriffsklärung\}\}/m
-      @@CONTAINS_LINK_TO_EPISODE_LIST = /Hauptartikel.*(?<main>Liste.*?)[\]\}]+/
-      @@CONTAINS_INARTICLE_EPISODE_LIST = /\<div.*\>Staffel.(\d+).*\<\/div\>.*class=\"wikitable\".*titel/m
-      @@INPAGE_SEASON_SEPARATOR = /\<div.style=\"clear:both\;.class=\"NavFrame\"\>/
-      @@WIKITABLE_EXTRACT_PATTERN = /(\{\|.class=\"wikitable\".*\|\})\n/m
-      @@IS_ONE_LINE_EPISODE_LIST = /\|.*\|\|.*\|\|.*\|\|/m
-      # this method will be called from the main program
-      # with an Serienrenamer::Episode instance as parameter
-      #
-      # it returns an array of episode information
-      def self.generate_episode_information(episode)
-        raise ArgumentError, "Serienrenamer::Episode instance needed" unless
-          episode.is_a?(Serienrenamer::Episode)
-        return [] unless episode.series.match(/\w+/)
-        unless defined? @cached_data
-          @cached_data = Hash.new
-        end
-        wiki = MediaWiki::Gateway.new(@@WIKIPEDIA_URL)
-        if ! @cached_data.has_key?(episode.series)
-          # search for a series site in wikipedia
-          series_site = nil
-          tries = 3
-          search_pattern = episode.series
-          search_pattern_modified = false
-          begin
-            wiki.search(search_pattern, nil, 15).each do |title|
-              pagedata = wiki.get(title)
-              if is_series_main_page?(pagedata)
-                series_site = title
-                break
-              end
-            end
-            # modify the search term pattern so that it contains
-            # only the last word if the search_pattern contains
-            # more than one words
-            if series_site.nil? && ! search_pattern_modified
-              search_pattern = search_pattern.match(/(\w+)\s*$/)[1]
-              search_pattern_modified = true
-              raise EOFError if search_pattern # break out and retry
-            end
-          rescue MediaWiki::APIError => e
-            tries -= 1
-            retry if tries > 0
-          rescue EOFError => e
-            retry
-          end
-          return [] unless series_site
-          # look for a link to a list of episodes
-          pagedata = wiki.get(series_site)
-          if contains_link_to_episode_list?(pagedata)
-            mainarticle = pagedata.match(@@CONTAINS_LINK_TO_EPISODE_LIST)[:main]
-            if mainarticle
-              episodelist_page = wiki.get(mainarticle)
-              series = parse_episodelist_page_data(episodelist_page)
-              @cached_data[episode.series] = series
-            end
-          elsif contains_inarticle_episode_list?(pagedata)
-            series = parse_inarticle_episodelist_page_data(pagedata)
-            @cached_data[episode.series] = series
-          else
-            warn "no episode list found"
-            return []
-          end
-        end
-        episode_names = []
-        # tries to find an episodename in cached_data
-        # otherwise returns empty array
-        begin
-          series = @cached_data[episode.series]
-          episodename = series[episode.season][episode.episode]
-          if episodename.match(/\w+/)
-            episode_names.push(episodename)
-          end
-        rescue
-        end
-        return episode_names
-      end
-      # This method will extract season based information
-      # from a string that contains a wikipedia episodelist page
-      #
-      # returns an Array of Arrays with episode information
-      # where episode and season numbers are the indizes
-      def self.parse_episodelist_page_data(pagedata, debug=false)
-        raise ArgumentError, 'String with pagedata expected' unless
-        pagedata.is_a?(String)
-        series_data = []
-        is_season_table_following = false
-        season_number = nil
-        # split the wikipedia page by headings and process
-        # the following paragraph if the heading starts with
-        # 'Staffel'
-        pagedata.split(/(==.*)==/).each do |paragraph|
-          if paragraph.match(/^==.*Staffel/)
-            match = paragraph.match(/^==.*Staffel.(?<seasonnr>\d+)/)
-            if match
-              season_number = match[:seasonnr].to_i
-              is_season_table_following = true
-            end
-          elsif is_season_table_following
-            #
-            # extract season table from this paragraph
-            season = parse_season_table(paragraph)
-            series_data[season_number] = season
-            is_season_table_following = false
-          end
-        end
-        return series_data
-      end
-      # this method will be called with a wikipedia seasontable
-      # as parameter and will extract all episodes from this
-      # and returns that as an array where the episode number is
-      # the index
-      def self.parse_season_table(table)
-        raise ArgumentError, 'String with seasontable expected' unless
-        table.is_a?(String)
-        season_data = []
-        matched_table = table.match(@@EPISODE_TABLE_PATTERN)
-        if matched_table
-          # extract all episode entries that
-          # looks like the following
-          #
-          # {{Episodenlisteneintrag
-          # | NR_GES = 107
-          # | NR_ST = 1
-          # | OT = The Mastodon in the Room
-          # | DT = Die Rückkehr der Scheuklappen
-          # | ZF =
-          # | EA = {{dts|23|09|2010}}
-          # | EAD = {{dts|08|09|2011}}
-          # }}
-          episodes = matched_table[:table].split(@@EPISODE_ENTRY_PATTERN)
-          if episodes
-            episodes.each do |epi|
-              # build up a hash from the entry
-              infos = {}
-              epi.lines.each do |part|
-                parts = part.strip.match(/(?<key>\w+).=.(?<value>.*)$/)
-                if parts
-                  infos[parts[:key].strip] = parts[:value].strip
-                end
-              end
-              next unless infos.has_key?('NR_ST')
-              # extract useful information and
-              # add it to the array
-              epi_nr = infos['NR_ST'].to_i
-              next unless epi_nr
-              # TODO make the following variable
-              epi_name = infos['DT'].strip
-              # remove all html tags and all following
-              # text from the episode name and the bold
-              # syntax from mediawiki [[text]]
-              epi_name.gsub!(/<\/?[^>]*>.*/, "")
-              epi_name.gsub!(/[\[\[\]\]]/, "")
-              next unless epi_name.match(/\w+/)
-              season_data[epi_nr] = epi_name
-            end
-          end
-        end
-        return season_data
-      end
-      # This method will extract season based information
-      # from a string that contains a series page with an
-      # episodelist included
-      #
-      # returns an Array of Arrays with episode information
-      # where episode and season numbers are the indizes
-      def self.parse_inarticle_episodelist_page_data(pagedata, debug=false)
-        raise ArgumentError, 'String with pagedata expected' unless
-        pagedata.is_a?(String)
-        series_data = []
-        # look for a paragraph with an episodelist
-        episodelist_paragraph = pagedata.split(/==.*==/).select { |p|
-          contains_inarticle_episode_list?(p) }[0]
-          raise ArgumentError, 'no episodelist found' unless episodelist_paragraph
-          # iterate through all seasons in this episode table
-          episodelist_paragraph.split(@@INPAGE_SEASON_SEPARATOR).each do |season|
-            next unless contains_inarticle_episode_list?(season)
-            season_nr = season.match(@@CONTAINS_INARTICLE_EPISODE_LIST)[1].to_i
-            wikitable = season.match(@@WIKITABLE_EXTRACT_PATTERN)[1]
-            # we have to detect the type of the inarticle season page
-            # because there are two different kinds of table structures
-            # used in the german wikipedia
-            if self.is_episode_list_with_one_episode_per_line?(wikitable)
-              episodes = parse_inarticle_season_table_with_one_line(wikitable)
-            else
-              episodes = parse_inarticle_season_table(wikitable)
-            end
-            # HACK if a season is splitted into different parts
-            # eg. Flashpoint (2.1 and 2.2) than merge that if possible
-            if series_data[season_nr] != nil
-              series_data[season_nr].each_with_index do |item, index|
-                episodes[index] = item unless episodes[index]
-              end
-            end
-            series_data[season_nr] = episodes
-          end
-          return series_data
-      end
-      # this method will be called with a wikitable for a season
-      # as parameter and will extract all episodes from this
-      # and returns that as an array where the episode number is
-      # the index
-      #
-      # Example for an wikitable for episodes:
-      #
-      # {| class="wikitable" width="100%"
-      # |- vertical-align: top; text-align:center; "
-      # | width="15" | '''Nummer''' <br /><small>(Gesamt)<small>
-      # | width="15" | '''Nummer''' <br /><small>(Staffel)<small>
-      # ! width="250" | Originaltitel
-      # ! width="250" | Deutscher Titel
-      # ! width="180" | Erstausstrahlung<br /><small>(USA Network)</small>
-      # ! width="180" | Erstausstrahlung<br /><small>(RTL)</small>
-      # ! width="180" | Erstausstrahlung<br /><small>(SF zwei)</small>
-      # |-
-      # | bgcolor="#DFEEEF"| 01
-      # | 01
-      # | ''Pilot''
-      # | ''Auch Reiche sind nur Menschen''
-      # | 4. Mai 2009
-      # | 17. Mai 2011
-      # | 6. Juni 2011 (Teil 1)<br />13. Juni 2011 (Teil 2)
-      # |-
-      # |}
-      #
-      def self.parse_inarticle_season_table(table)
-        raise ArgumentError, 'String with seasontable expected' unless
-        table.is_a?(String)
-        season_data = []
-        episode_nr_line_nr   = nil
-        episode_name_line_nr = nil
-        table.split(/^\|\-.*$/).each do |tablerow|
-          tablerow.strip!
-          # skip invalid rows
-          lines = tablerow.lines.to_a
-          next unless lines.length >= 4
-          if tablerow.match(/width=\"\d+\"/)
-            # extract line numbers for needed data that
-            # are in the table header
-            lines.each_with_index do |item, index|
-              if item.match(/Nummer.*Staffel/i)
-                episode_nr_line_nr = index
-                # TODO make the following more variable
-              elsif item.match(/Deutscher.*Titel/i)
-                episode_name_line_nr = index
-              end
-            end
-          else
-            # extract episode information
-            if episode_nr_line_nr && episode_name_line_nr
-              md_nr = lines[episode_nr_line_nr].strip.match(/(\d+)/)
-              if md_nr
-                episode_nr = md_nr[1].to_i
-                md_name = lines[episode_name_line_nr].strip.match(/^\|.(.*)$/)
-                if md_name
-                  episode_name = md_name[1]
-                  episode_name.gsub!(/[\'\"\[\]]/, "")
-                  next unless episode_name.match(/\w+/)
-                  season_data[episode_nr] = episode_name.strip
-                end
-              end
-            end
-          end
-        end
-        return season_data
-      end
-      # this method will be called with a wikitable for a season
-      # as parameter and will extract all episodes from this
-      # and returns that as an array where the episode number is
-      # the index
-      #
-      # this method implements a special format that takes place in
-      # e.g. 'Prison Break' where an episode is not spread along several
-      # lines like in the method above
-      #
-      # Example for an wikitable for episodes:
-      #
-      #{| class="wikitable"
-      # |- style="color:#black; background-color:#006699"
-      # ! '''Episode''' !! '''Deutscher Titel''' !! '''Originaltitel''' !! '''Erstausstrahlung (DE)''' !! '''Erstausstrahlung (USA)'''
-      # |-
-      # |'''1''' (1-01) || Der große Plan || Pilot || 21. Juni 2007 || 29. August 2005
-      # |-
-      # |'''2''' (1-02) || Lügt Lincoln? || Allen || 21. Juni 2007 || 29. August 2005
-      # |-
-      # |'''3''' (1-03) || Vertrauenstest || Cell Test || 28. Juni 2007 || 5. September 2005
-      # |-
-      # |'''4''' (1-04) || Veronica steigt ein || Cute Poison || 28. Juni 2007 || 12. September 2005
-      #
-      def self.parse_inarticle_season_table_with_one_line(table)
-        raise ArgumentError, 'String with seasontable expected' unless
-        table.is_a?(String)
-        season_data = []
-        episode_nr_col   = nil
-        episode_name_col = nil
-        table.split(/^\|\-.*$/).each do |tablerow|
-          if tablerow.match(/!!.*!!.*!!/)
-            # extract column numbers from table header
-            tablerow.split(/!!/).each_with_index do |col,index|
-              episode_nr_col   = index if col.match(/Episode/i)
-              episode_name_col = index if col.match(/Deutsch.*Titel/i)
-            end
-          elsif tablerow.match(/\|\|.*\w+.*\|\|/)
-            tablerow.strip!
-            columns = tablerow.split(/\|\|/)
-            # the following cleanes up the column so that the following occurs
-            # " '''7''' (1-07) " => "7     1 07"
-            #
-            # we can now extract the last bunch of digits and this algorithm is
-            # some kind of format independent
-            dirty_episode_nr   = columns[episode_nr_col].gsub(/\D/, " ").strip
-            episode_nr = dirty_episode_nr.match(/(\d+)$/)[1]
-            next unless episode_nr
-            episode_name = columns[episode_name_col].strip
-            next unless episode_nr.match(/\w+/)
-            season_data[episode_nr.to_i] = episode_name
-          end
-        end
-        return season_data
-      end
-      # this method checks if the page is the main page
-      # for a series
-      #
-      # returns true if page contains the infobox that
-      # is typical for series pages in wikipedia
-      def self.is_series_main_page?(page)
-        page.match(@@SERIES_SITE_TEST_PATTERN) != nil
-      end
-      # check the site if it is a disambiguation site
-      #
-      # returns true if this site links to pages with
-      # themes with the same name
-      def self.is_disambiguation_site?(page)
-        page.match(@@DISAMBIGUATION_TEST_PATTERN) != nil
-      end
-      # test if the page contains a link to an article
-      # with an episode list
-      def self.contains_link_to_episode_list?(page)
-        page.match(@@CONTAINS_LINK_TO_EPISODE_LIST) != nil
-      end
-      # test if the page contains a episode list
-      def self.contains_inarticle_episode_list?(page)
-        page.match(@@CONTAINS_INARTICLE_EPISODE_LIST) != nil
-      end
-      # tests for the type of in article episode list
-      def self.is_episode_list_with_one_episode_per_line?(page)
-        page.match(@@IS_ONE_LINE_EPISODE_LIST) != nil
-      end
-    end
-  end
-end