RubyGems - yyyc514-syndication - Versions diffs - 0.6.1.1 - Mend

yyyc514-syndication 0.6.1.1

Files changed (24) hide show

data/CHANGES +10 -0
data/DEVELOPER +5 -0
data/IMPLEMENTATION +55 -0
data/README +228 -0
data/examples/apple.rb +24 -0
data/examples/google.rb +23 -0
data/examples/yahoo.rb +21 -0
data/lib/syndication/atom.rb +531 -0
data/lib/syndication/common.rb +289 -0
data/lib/syndication/content.rb +44 -0
data/lib/syndication/dublincore.rb +98 -0
data/lib/syndication/feedburner.rb +18 -0
data/lib/syndication/google.rb +58 -0
data/lib/syndication/podcast.rb +90 -0
data/lib/syndication/rss.rb +332 -0
data/lib/syndication/syndication.rb +49 -0
data/lib/syndication/tagsoup.rb +51 -0
data/rakefile +60 -0
data/test/atomtest.rb +190 -0
data/test/feedburntest.rb +79 -0
data/test/google.rb +91 -0
data/test/rsstest.rb +422 -0
data/test/tagsouptest.rb +86 -0
metadata +83 -0

data/lib/syndication/common.rb ADDED Viewed

@@ -0,0 +1,289 @@
+# The file common.rb contains code common to both Atom and RSS parsing.
+#
+# Copyright © mathew <meta@pobox.com> 2006.
+# Licensed under the same terms as Ruby.
+#
+# $Header: /var/cvs/syndication/syndication/lib/syndication/common.rb,v 1.4 2005/10/23 22:51:17 meta Exp $
+require 'uri'
+require 'rexml/parsers/streamparser'
+require 'rexml/streamlistener'
+require 'rexml/document'
+require 'date'
+# To parse Atom feeds, use Syndication::Atom::Parser.
+# To parse RSS feeds, use Syndication::RSS::Parser.
+module Syndication
+  # A Container is an object in the parse tree that stores data, and possibly
+  # other objects. Its naming and behavior is an internal detail, not part
+  # of the API, and hence subject to change.
+  #
+  # In other words, to use the library you don't have to know about anything
+  # below.
+  class Container
+    # Convert a tag (possibly with namespace) to a method name.
+    def tag2method(tag)
+      return tag.downcase.sub(/:/, '_') + '='
+    end
+    # Create a container.
+    # parent is the new container's parent object in the final parse tree.
+    # tag is the XML tag which caused creation of the container.
+    # attrs is a hash of {attr => value} of the XML attributes in the tag.
+    def initialize(parent, tag = nil, attrs = nil)
+      @parent = parent
+      @tag = tag
+      # and ignore attrs by default
+    end
+    # Handle a start tag and attributes.
+    # Checks to see if self has a field with the appropriate name.
+    # If so, we send it the attributes (if any), and record that the
+    # current method is the method to access that field.
+    def tag_start(tag, attrs = nil)
+      method = tag2method(tag)
+      if self.respond_to?(method)
+        if attrs
+          self.send(method, attrs)
+        end
+        @current_method = method
+      end
+    end
+    # Handle an end tag, and return what the new current object should be.
+    #
+    # If the tag matches the one we were created with, this container is
+    # complete and the new current object is its parent.
+    #
+    # If there's no parent (i.e. this is the top level container in the
+    # parse tree), the new current object must be unchanged.
+    #
+    # Otherwise, pass the end tag up to the parent to see if it can do
+    # anything with it.
+    def tag_end(endtag, current)
+      if @tag == endtag
+        return @parent
+      end
+      if @parent == nil
+        return current
+      end
+      return @parent.tag_end(endtag, current)
+    end
+    # Store an object in the parse tree, either in self, or in one of self's
+    # ancestors.
+    def store(tag, obj)
+      method = tag2method(tag)
+      if self.respond_to?(method)
+        self.send(method, obj)
+      else
+        @parent.store(tag, obj) if @parent
+      end
+    end
+    # Parse a date field on demand. DateTime.parse is sloooow, so don't call
+    # it unless you really have to.
+    def parse_date(field)
+      if !field
+        return nil
+      end
+      if field.kind_of?(String)
+        dt = DateTime.parse(field)
+        if dt.kind_of?(DateTime)
+          field = dt
+        end
+      end
+      return field
+    end
+    # Strip the parent field from a container, used to make a container
+    # more amenable to pretty-printing.
+    def strip
+      @parent = nil
+      return self
+    end
+  end
+  # Shared parts of parser code for Atom and RSS. This is an abstract class;
+  # Atom::Parser and RSS::Parser are the concrete classes which actually parse
+  # syndication feeds.
+  #
+  # You don't need to know about anything below in order to use the library.
+  #
+  # The basic parsing strategy is:
+  #
+  # - The parser keeps a current_object pointer which represents the object
+  # in the parse tree that corresponds to where we are in the XML tree. To
+  # use a metaphor, it's the object where parse tree growth is occurring.
+  #
+  # - REXML dispatches events to the parser representing start and end tags and
+  # text. The parser sends the events to the current_object, which replies with
+  # what the new current_object should be after the event has been dealt with.
+  #
+  # - The job of creating child objects when appropriate is handled by the
+  # objects of the parse tree.
+  #
+  # - Reflection is used to store data in the parse tree. Accessor names are
+  # derived from tags in a standard way once namespaces have been standardized.
+  class AbstractParser
+    include REXML::StreamListener
+    # A Hash of namespace URLs the module knows about, returning the standard
+    # prefix to remap to. define_namespace looks a declared URL up here, so a
+    # feed may bind any prefix it likes to one of these namespaces and its
+    # tags are still renamed to the prefix listed below.
+    KNOWN_NAMESPACES = {
+      'http://purl.org/dc/elements/1.1/' => 'dc',
+      'http://purl.org/dc/terms/' => 'dcterms',
+      'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf',
+      'http://purl.org/rss/1.0/modules/content/' => 'content',
+      'http://www.itunes.com/DTDs/Podcast-1.0.dtd' => 'itunes',
+      'http://www.w3.org/1999/xhtml' => 'xhtml',
+      'http://schemas.google.com/g/2005' => 'gd',
+      'http://rssnamespace.org/feedburner/ext/1.0' => 'feedburner'
+    }
+    # Create a new AbstractParser. The optional argument consists of text to
+    # parse.
+    def initialize(text = nil)
+      reset
+      # Initialize mapping from tags to classes, which only needs to be done
+      # once and not reset. Concrete classes which do actual parsing will
+      # fill the hash.
+      @tag_to_class = Hash.new
+      parse(text) if text
+    end
+    # Catch any stuff that drops right through the parse tree, and simply
+    # ignore it.
+    def store(tag, obj)
+      # Intentionally empty: whatever reaches the parser itself is discarded.
+    end
+    # Catch and ignore closing tags that don't match anything open.
+    def end_tag(tag, current)
+      return current
+    end
+    # Reset the parser ready to parse a new feed.
+    def reset
+      @current_object = @parsetree
+      @tagstack = Array.new
+      @textstack = Array.new
+      @xhtml = ''
+      @xhtmlmode = false
+      @namespacemap = Hash.new
+      # @parsetree is set up by the concrete classes
+    end
+    # Parse the text provided. Returns a Syndication::Atom::Feed or
+    # Syndication::RSS::Feed object, according to which concrete Parser
+    # class is being used.
+    # The second argument is optional and determines the parser engine to
+    # use. The default is REXML. To use TagSoup, pass in the value
+    # Syndication::TagSoup
+    def parse(text, classname = REXML::Document)
+      classname.parse_stream(text, self)
+      return @parsetree
+    end
+    # Handle namespace translation for a raw tag.
+    def handle_namespace(tag, attrs = nil)
+      if attrs and tag.match(/^(rss|\w+:rdf|\w+:div)$/i)
+        for key in attrs.keys
+          if key.match(/xmlns:(\w+)/i)
+            define_namespace($1, attrs[key])
+          end
+        end
+      end
+      if tag.match(/(\w+):(\w+)/)
+        if @namespacemap[$1]
+          tag = "#{@namespacemap[$1]}:#{$2}"
+        end
+      end
+      return tag
+    end
+    # Process a namespace definition for the given prefix and namespace
+    # definition URL.
+    #
+    # If we recognize the URL, we set up a mapping from their prefix to
+    # our canonical choice of prefix.
+    def define_namespace(prefix, url)
+      myprefix = KNOWN_NAMESPACES[url]
+      if myprefix
+        @namespacemap[prefix] = myprefix
+      end
+    end
+    # Called when REXML finds the start of an XML element.
+    def tag_start(tag, attrs) #:nodoc:
+      tag = handle_namespace(tag, attrs)
+      cl = @class_for_tag[tag.downcase]
+      if cl
+        # If the tag requires the creation of an object, we create it as a
+        # child of the current object, then ask the current object to store
+        # it. It becomes the new current object.
+        newobj = cl.new(@current_object, tag, attrs)
+        @current_object.store(tag, newobj)
+        @current_object = newobj
+      else
+        # Otherwise, we ask the current object to do something with the tag.
+        if @current_object
+          @current_object.tag_start(tag, attrs)
+        end
+      end
+      # We also push to the stacks we use for text buffering.
+      @tagstack.push(tag)
+      @textstack.push('')
+    end
+    # Called when REXML finds the end of an XML element.
+    def tag_end(endtag)
+      endtag = handle_namespace(endtag, nil)
+      # There are two tasks to perform: 1. store the data from the buffers,
+      # and 2. work out if we need to close out any objects in the parse
+      # tree and move the current object pointer
+      begin
+        # Store the top text buffer that's on the stacks by passing it to the
+        # current object along with its tag. Repeat until we find a stacked
+        # tag which matches the endtag, or run out of buffers.
+        tag = @tagstack.pop
+        text = @textstack.pop
+        if text
+          text.strip!
+          if text.length > 0 and @current_object
+            @current_object.store(tag, text)
+          end
+        end
+      end until tag == endtag or @tagstack.length == 0
+      # Pass the tag end event to the current object to find out what the
+      # new current object should be.
+      if @current_object
+        @current_object = @current_object.tag_end(endtag, @current_object)
+      end
+    end
+    # Called when REXML finds a text fragment.
+    # Buffers the text on the buffer stacks ready for the end tag.
+    def text(s)
+      if @textstack.last
+        @textstack.last << s
+      end
+    end
+    # Supposed to be called when REXML finds a CDATA-encoded piece of text.
+    def cdata(s)
+      # For content_encoded we re-encode, because (a) the API for RSS content
+      # module provides both encoded and decoded results to the user, and
+      # (b) REXML doesn't always seem to pass CDATA via this callback method.
+      # For other elements, we keep the text decoded.
+      if @textstack.last
+        if @tagstack.last == 'content:encoded'
+          @textstack.last << "<![CDATA[#{s}]]>"
+        else
+          @textstack.last << s
+        end
+      end
+    end
+  end
+end

data/lib/syndication/content.rb ADDED Viewed

@@ -0,0 +1,44 @@
+# Copyright © mathew <meta@pobox.com> 2005.
+# Licensed under the same terms as Ruby.
+#
+# $Header$
+module Syndication
+  # Mixin for RSS 1.0 content module.
+  #
+  # This is the approved way to include actual HTML text in an RSS feed.
+  # To use it, require 'syndication/content' to add the content_encoded
+  # and content_decoded methods to the Syndication::RSS::Item and
+  # Syndication::RSS::Channel classes.
+  #
+  module Content
+    # Actual web content, entity encoded or CDATA-escaped.
+    attr_accessor :content_encoded
+    # Decoded version of content_encoded, as HTML.
+    def content_decoded
+      if !@content_encoded or @content_encoded == ''
+        return @content_encoded
+      end
+      # CDATA is the easier case
+      if @content_encoded.match(/<!\[CDATA\[(.*)\]\]>/m)
+        return $1
+      end
+      # Decode escaped entities
+      x = @content_encoded.gsub(/&lt;/, '<')
+      x.gsub!(/&gt;/, '>')
+      return x.gsub(/&amp;/, '&')
+    end
+  end
+  #:enddoc:
+  module RSS
+    # Reopen the RSS classes which can carry content and mix the Content
+    # methods into each of them.
+    class Item
+      include Content
+    end
+    class Channel
+      include Content
+    end
+  end
+end

data/lib/syndication/dublincore.rb ADDED Viewed

@@ -0,0 +1,98 @@
+# Copyright © mathew <meta@pobox.com> 2005.
+# Licensed under the same terms as Ruby.
+#
+# $Header$
+module Syndication
+  # Mixin for Dublin Core metadata in RSS feeds.
+  #
+  # If you require 'syndication/dublincore' these methods are added to the
+  # Syndication::RSS::Channel, Syndication::RSS::Item,
+  # Syndication::RSS::Image and Syndication::RSS::TextInput classes.
+  #
+  # The access method names are the Dublin Core element names, prefixed with
+  # dc_.
+  #
+  module DublinCore
+    # A name by which the item is formally known.
+    attr_accessor :dc_title
+    # The entity primarily responsible for making the content of the item.
+    attr_accessor :dc_creator
+    # The topic of the content of the item, typically as keywords
+    # or key phrases.
+    attr_accessor :dc_subject
+    # A description of the content of the item.
+    attr_accessor :dc_description
+    # Entity responsible for making the item available.
+    attr_accessor :dc_publisher
+    # Entity responsible for contributing this item.
+    attr_accessor :dc_contributor
+    # Date of creation or availability of item.
+    # Returned as a DateTime if it will parse; otherwise, returned as a
+    # string. (Dublin Core does not require any particular date and time
+    # format, so guaranteeing parsing is not possible.)
+    def dc_date
+      if @dc_date and !@dc_date.kind_of?(DateTime)
+        @dc_date = DateTime.parse(@dc_date)
+      end
+      return @dc_date
+    end
+    # Date of creation or availability of item.
+    attr_writer :dc_date
+    # Nature or genre of item, usually from a controlled vocabulary.
+    attr_accessor :dc_type
+    # Physical or digital format of item.
+    attr_accessor :dc_format
+    # An unambigious identifier which identifies the item.
+    attr_accessor :dc_identifier
+    # A reference to a resource from which the item is derived.
+    attr_accessor :dc_source
+    # The language the item is in, coded as per RFC 1766.
+    attr_accessor :dc_language
+    # A reference to a related resource.
+    attr_accessor :dc_relation
+    # The extent or scope of coverage of the item, e.g. a geographical area.
+    attr_accessor :dc_coverage
+    # Information about rights held over the item, e.g. copyright or patents.
+    attr_accessor :dc_rights
+  end
+  #:enddoc:
+  module RSS
+    # Now we mix in the DublinCore elements to all the Syndication classes that
+    # can contain them. There's probably some clever way to do this via
+    # reflection, but there _is_ such a thing as being too clever.
+    # Each class below is reopened and gains the dc_* methods.
+    class Item
+      include DublinCore
+    end
+    class Channel
+      include DublinCore
+    end
+    class Image
+      include DublinCore
+    end
+    class TextInput
+      include DublinCore
+    end
+  end
+end

data/lib/syndication/feedburner.rb ADDED Viewed

@@ -0,0 +1,18 @@
+module Syndication
+  # Mixin for the FeedBurner extension to RSS feeds.
+  module Feedburner
+    # FeedBurner fields which occur in Items.
+    module Item
+      # The original URL, before feedburner rewrote it for tracking purposes
+      attr_accessor :feedburner_origlink
+    end
+  end
+  module RSS
+    # Reopen the RSS item class and mix in the FeedBurner accessor.
+    class Item
+      include Feedburner::Item
+    end
+  end
+end

data/lib/syndication/google.rb ADDED Viewed

@@ -0,0 +1,58 @@
+# Copyright © mathew <meta@pobox.com> 2006.
+# Licensed under the same terms as Ruby.
+module Syndication
+  # Mixin for Google Data in Atom feeds.
+  #
+  # If you require 'syndication/google' these methods are added to the
+  # Syndication::Atom::Entry and Syndication::Atom::Feed classes.
+  #
+  # See http://code.google.com/apis/gdata/calendar.html for more information
+  # on Google Calendar Data APIs.
+  #
+  # See examples/google.rb for a simple example.
+  #
+  module Google
+    # Where the event is to occur
+    attr_reader :gd_where
+    def gd_where=(attrs)
+      if attrs['valueString']
+        @gd_where = attrs['valueString']
+      end
+    end
+    def gd_when=(attrs)
+      if attrs['startTime']
+        @starttime = attrs['startTime']
+      end
+      if attrs['endTime']
+        @endtime = attrs['endTime']
+      end
+    end
+    # When the event is to occur, as an Array of [start DateTime, end DateTime].
+    def gd_when
+      s = e = nil
+      if @starttime
+        s = DateTime.parse(@starttime)
+      end
+      if @endtime
+        e = DateTime.parse(@endtime)
+      end
+      return [s,e]
+    end
+  end
+  module Atom
+    # Reopen the Atom classes which can carry Google Data elements and mix
+    # the Google methods into each of them.
+    class Entry
+      include Google
+    end
+    class Feed
+      include Google
+    end
+  end
+end

data/lib/syndication/podcast.rb ADDED Viewed

@@ -0,0 +1,90 @@
+# Copyright © mathew <meta@pobox.com> 2005.
+# Licensed under the same terms as Ruby.
+#
+# $Header$
+module Syndication
+  # Mixin for iTunes podcast RSS elements.
+  #
+  # To use this, require 'syndication/podcast' to add appropriate methods
+  # to the Item and Channel classes.
+  #
+  # See <URL:http://phobos.apple.com/static/iTunesRSS.html> for more
+  # information.
+  #
+  # See Syndication::Podcast::Both for methods added to both Item and
+  # Channel RSS objects.
+  #
+  # See Syndication::Podcast::Channel for methods added to Channel objects.
+  #
+  # See Syndication::Podcast::Item for methods added to Item objects.
+  #
+  module Podcast
+    # iTunes fields which occur in Items only.
+    module Item
+      # Artist column in iTunes.
+      attr_accessor :itunes_author
+      # Duration of item, in seconds.
+      attr_reader :itunes_duration
+      # Set the duration. Apple specifies four possible formats for the
+      # XML data: HH:MM:SS, H:MM:SS, MM:SS, or M:SS.
+      def itunes_duration=(x)
+        if x.match(/(\d?\d):(\d\d):(\d\d)/)
+          @itunes_duration = $3.to_i + $2.to_i * 60 + $1.to_i * 3600
+        elsif x.match(/(\d?\d):(\d\d)/)
+          @itunes_duration = $2.to_i + $1.to_i * 60
+        end
+      end
+    end
+    # iTunes fields which occur in Channels only.
+    module Channel
+      # Owner, not shown, used for contact only.
+      attr_accessor :itunes_owner
+    end
+    # iTunes fields which occur both in Channels and in Items.
+    module Both
+      # Prevent this entity from appearing in the iTunes podcast directory?
+      attr_accessor :itunes_block
+      # Parental advisory graphic?
+      attr_accessor :itunes_explicit
+      # Keywords, not shown but can be searched via iTunes.
+      attr_accessor :itunes_keywords
+      # Description column in iTunes.
+      attr_accessor :itunes_subtitle
+      # Summary, shown when i-in-circle icon is clicked in Description
+      # column of iTunes.
+      attr_accessor :itunes_summary
+      # Category column(s) in iTunes and music store browser, as an array
+      # of strings (categories then subcategories).
+      attr_reader :itunes_category
+      # Add an iTunes category; they can be nested.
+      def itunes_category=(x)
+        if !@itunes_category
+          @itunes_category = Array.new
+        end
+        @itunes_category.push(x)
+      end
+    end
+  end
+  #:enddoc:
+  module RSS
+    # Reopen the RSS classes and mix in the iTunes fields: each gets its own
+    # module plus the fields shared by both.
+    class Item
+      include Podcast::Item
+      include Podcast::Both
+    end
+    class Channel
+      include Podcast::Channel
+      include Podcast::Both
+    end
+  end
+end