feedparser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ec4aa58ed835c53c649f6289eebf236e25802c69
4
+ data.tar.gz: e536aa9a2bd29ce9beb2f22765d40c5a027c920f
5
+ SHA512:
6
+ metadata.gz: 50fe366a7bd6d003f280a0c291d2569ad5122f78fbe7c48f988264ddd32244182ec9684bd8c4e26ea14215c143a7d6770704dee975743b98c7a10fdd34620c1e
7
+ data.tar.gz: f6d0210aa31b7b9878a25d27c208efe47a0aca840a7253116ea04f3ec4a1da51da069a07d7741f4e645263a440e1b4245a8df677534a70d536d5f3b10348a72c
File without changes
@@ -0,0 +1,3 @@
1
+ ### 0.1.0 / 2013-09-19
2
+
3
+ * Everything is new. First release.
@@ -0,0 +1,20 @@
1
+ HISTORY.md
2
+ Manifest.txt
3
+ README.md
4
+ Rakefile
5
+ lib/feedparser.rb
6
+ lib/feedparser/builder/atom.rb
7
+ lib/feedparser/builder/rss.rb
8
+ lib/feedparser/feed.rb
9
+ lib/feedparser/helper/atom_v03.rb
10
+ lib/feedparser/item.rb
11
+ lib/feedparser/parser.rb
12
+ lib/feedparser/version.rb
13
+ test/feeds/googlegroups.atom
14
+ test/feeds/googlegroups2.atom
15
+ test/feeds/quirksblog.atom.v03
16
+ test/helper.rb
17
+ test/test_atom.rb
18
+ test/test_atom_from_file.rb
19
+ test/test_atom_v03.rb
20
+ test/test_rss.rb
@@ -0,0 +1,95 @@
1
+ # feedparser
2
+
3
+ feedparser gem - web feed parser and normalizer (RSS 2.0, Atom, etc.)
4
+
5
+ * home :: [github.com/rubylibs/feedutils](https://github.com/rubylibs/feedutils)
6
+ * bugs :: [github.com/rubylibs/feedutils/issues](https://github.com/rubylibs/feedutils/issues)
7
+ * gem :: [rubygems.org/gems/feedutils](https://rubygems.org/gems/feedutils)
8
+ * rdoc :: [rubydoc.info/gems/feedutils](http://rubydoc.info/gems/feedutils)
9
+ * forum :: [groups.google.com/group/feedreader](http://groups.google.com/group/feedreader)
10
+
11
+
12
+ ## Usage
13
+
14
+ ### Structs
15
+
16
+ Feed • Item
17
+
18
+ ### `Feed` Struct
19
+
20
+ ~~~
21
+ class Feed
22
+ attr_accessor :format # e.g. atom|rss 2.0|etc.
23
+ attr_accessor :title
24
+ attr_accessor :title_type # e.g. text|html|html-escaped (optional) -use - why?? why not??
25
+ attr_accessor :url
26
+
27
+ attr_accessor :items
28
+
29
+ attr_accessor :summary # e.g. description (rss)
30
+ attr_accessor :summary_type # e.g. text|html|html-escaped
31
+
32
+ attr_accessor :title2 # e.g. subtitle (atom)
33
+ attr_accessor :title2_type # e.g. text|html|html-escaped
34
+
35
+ attr_accessor :published
36
+ attr_accessor :updated
37
+ attr_accessor :built
38
+
39
+ attr_accessor :generator
40
+ attr_accessor :generator_version # e.g. @version (atom)
41
+ attr_accessor :generator_uri # e.g. @uri (atom) - use alias url/link ???
42
+ end
43
+ ~~~
44
+
45
+
46
+ ### `Item` Struct
47
+
48
+ ~~~
49
+ class Item
50
+ attr_accessor :title
51
+ attr_accessor :title_type # optional for now (text|html|html-escaped) - not yet set
52
+ attr_accessor :url # todo: rename to link (use alias) ??
53
+
54
+ attr_accessor :content
55
+ attr_accessor :content_type # optional for now (text|html|html-escaped|binary-base64) - not yet set
56
+
57
+ attr_accessor :summary
58
+ attr_accessor :summary_type # optional for now (text|html|html-escaped) - not yet set
59
+
60
+ attr_accessor :published
61
+ attr_accessor :updated
62
+
63
+ attr_accessor :guid # todo: rename to id (use alias) ??
64
+ end
65
+ ~~~
66
+
67
+
68
+ ### Read Feed Example
69
+
70
+ ~~~
71
+ require 'open-uri'
72
+ require 'feedparser'
73
+
74
+ xml = open( 'http://openfootball.github.io/atom.xml' ).read
75
+
76
+ feed = FeedParser::Parser.parse( xml )
77
+ pp feed
78
+ ~~~
79
+
80
+
81
+
82
+ ## Alternatives
83
+
84
+ - [`syndication`](http://syndication.rubyforge.org) [(Source)](https://github.com/lpar/syndication) - by Mathew (aka lpar); RSS 1.0, 2.0, Atom, and understands namespaces; optional support for Dublin Core, iTunes/podcast feeds, and RSS 1.0 Syndication and Content modules
85
+ - [`simple-rss`](http://rubyforge.org/projects/simple-rss)
86
+ - [`feedtools`](http://rubyforge.org/projects/feedtools)
87
+
88
+ TBD
89
+
90
+
91
+ ## License
92
+
93
+ The `feedparser` scripts are dedicated to the public domain.
94
+ Use it as you please with no restrictions whatsoever.
95
+
@@ -0,0 +1,32 @@
1
+ require 'hoe'
2
+ require './lib/feedparser/version.rb'
3
+
4
# Gem specification for feedparser, declared through the Hoe DSL.
Hoe.spec 'feedparser' do

  self.version = FeedParser::VERSION

  # the one-line summary doubles as the long description
  self.summary     = 'feedparser - web feed parser and normalizer (RSS 2.0, Atom, etc.)'
  self.description = summary

  self.urls = ['https://github.com/feedreader/feed.parser']

  self.author = 'Gerald Bauer'
  self.email  = 'feedreader@googlegroups.com'

  # use the .md extension so that GitHub renders the files as markdown
  self.readme_file  = 'README.md'
  self.history_file = 'HISTORY.md'

  # runtime dependencies
  self.extra_deps = [['logutils', '>= 0.6.1']]

  ### todo: add fetcher dep for testing (e.g. development only)

  self.licenses = ['Public Domain']

  self.spec_extras = { :required_ruby_version => '>= 1.9.2' }

end
@@ -0,0 +1,27 @@
1
+ # core and stdlibs
2
+
3
+ require 'rss'
4
+ require 'pp'
5
+ require 'date'
6
+
7
+ # 3rd party gems/libs
8
+
9
+ require 'logutils'
10
+
11
+ # our own code
12
+
13
+ require 'feedparser/version' # let it always go first
14
+
15
+ require 'feedparser/builder/atom'
16
+ require 'feedparser/builder/rss'
17
+
18
+ require 'feedparser/helper/atom_v03'
19
+
20
+ require 'feedparser/feed'
21
+ require 'feedparser/item'
22
+ require 'feedparser/parser'
23
+
24
+
25
+
26
# say hello - print FeedParser.banner only when Ruby's $DEBUG flag
#  or the $RUBYLIBS_DEBUG global is set
if $DEBUG || (defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG)
  puts FeedParser.banner
end
@@ -0,0 +1,141 @@
1
+
2
+ module FeedParser
3
+
4
+ class AtomFeedBuilder
5
+
6
+ include LogUtils::Logging
7
+
8
# Converts the given Atom feed right away (via build_feed) and keeps
# the result for to_feed.
#
# atom_feed - the parsed Atom feed object passed on to build_feed
def initialize(atom_feed)
  @feed = build_feed(atom_feed)
end
11
+
12
# Returns the feed object that was stored in @feed by the constructor.
def to_feed
  @feed
end
15
+
16
# Convenience entry point: creates a builder for atom_feed and
# returns the feed it built.
#
# atom_feed - the parsed Atom feed object handed to the constructor
def self.build(atom_feed)
  new(atom_feed).to_feed
end
20
+
21
+
22
# Builds a normalized Feed from a parsed Atom feed.
#
# Copies title, url, updated, generator (content/version/uri) and
# subtitle, then converts every entry with build_feed_item.
#
# atom_feed - parsed Atom feed; this method reads #title, #id, #links,
#             #updated, #generator, #subtitle and #items from it
#             (presumably an RSS::Atom::Feed from the stdlib rss
#             parser - confirm against the caller)
#
# Returns the populated Feed. Missing or empty <title>, <id> and
# <generator> elements yield nil values instead of raising NoMethodError.
def build_feed( atom_feed )
  feed = Feed.new
  ## feed.object = atom_feed # not used for now
  feed.format = 'atom'

  ## note: guard against a missing <title> / <id> element
  ##   (real-world feeds are not always valid)
  title   = atom_feed.title ? atom_feed.title.content : nil
  feed_id = atom_feed.id ? atom_feed.id.content : nil

  feed.title = title
  logger.debug " atom | title.content >#{title}< : #{title.class.name}"

  logger.debug " atom | id.content >#{feed_id}< : #{feed_id.class.name}"

  feed.url = nil

  ## note: use links (plural to allow multiple links e.g. self,alternate,etc.)
  atom_feed.links.each_with_index do |link,i|
    logger.debug " atom | link[#{i+1}] link rel=>#{link.rel}< : #{link.rel.class.name} type=#{link.type} href=#{link.href}"

    ## for now assume alternate is link or no rel specified (assumes alternate)
    ## note: only the first matching link is used
    if feed.url.nil? && (link.rel == 'alternate' || link.rel.nil?)
      feed.url = link.href
    end
  end

  ## note: as fallback try id if still no url found
  ##   use url only if starts_with http
  ##   might not be link e.g blogger uses for ids =>
  ##     <id>tag:blogger.com,1999:blog-4704664917418794835</id>
  ##
  ## note: id might actually be link to feed NOT to site (remove fallback - why - why not???)
  ##
  ## Note: remove (strip) leading and trailing spaces and newlines
  if feed.url.nil? && feed_id
    id_url = feed_id.strip
    feed.url = id_url if id_url.start_with?( 'http' )
  end

  if atom_feed.updated
    # NOTE: empty updated.content e.g. used by google groups feed
    #   will return nil : NilClass

    ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
    updated = atom_feed.updated.content
    feed.updated = updated.nil? ? nil : updated.to_datetime
    logger.debug " atom | updated.content >#{updated}< : #{updated.class.name}"
  end

  if atom_feed.generator
    generator = atom_feed.generator
    name      = generator.content

    ## Note: remove (strip) leading and trailing spaces and newlines
    ##   (an empty generator element has no content to strip)
    feed.generator = name.nil? ? nil : name.strip
    logger.debug " atom | generator.content >#{name}< : #{name.class.name}"

    feed.generator_version = generator.version
    feed.generator_uri     = generator.uri
    logger.debug " atom | generator.version >#{generator.version}< : #{generator.version.class.name}"
    logger.debug " atom | generator.uri >#{generator.uri}< : #{generator.uri.class.name}"
  end

  if atom_feed.subtitle
    feed.title2 = atom_feed.subtitle.content
    logger.debug " atom | subtitle.content >#{atom_feed.subtitle.content}< : #{atom_feed.subtitle.content.class.name}"
  end

  feed.items = atom_feed.items.map { |atom_item| build_feed_item( atom_item ) }

  feed # return new feed
end # method build_feed
96
+
97
# Builds a normalized Item from a parsed Atom entry.
#
# atom_item - parsed Atom entry; this method reads #title, #links,
#             #updated, #published, #id, #content and #summary from it
#             (presumably an RSS::Atom::Feed::Entry - confirm against
#             the caller)
#
# The item url is taken from the first link with rel="alternate" (or
# with no rel at all), mirroring the link selection used for the feed
# itself; if no such link exists the first link is used, and nil if the
# entry has no links. Missing <title> / <id> elements yield nil values
# instead of raising NoMethodError.
#
# Returns the populated Item.
def build_feed_item( atom_item )
  item = Item.new
  ## item.object = atom_item # not used for now

  title = atom_item.title ? atom_item.title.content : nil
  item.title = title

  ## note: entries may carry several links (e.g. replies, edit, self, alternate);
  ##   prefer alternate (or no rel) - fall back to the first link
  links     = atom_item.links || []
  preferred = links.find { |link| link.rel == 'alternate' || link.rel.nil? } || links.first
  url       = preferred ? preferred.href : nil
  item.url  = url

  logger.debug " atom | item.title.content: >#{title}< : #{title.class.name}"
  logger.debug " atom | item.link.href: >#{url}< : #{url.class.name}"

  if atom_item.updated
    ## change time to utc if present? why? why not?
    #  -- .utc.strftime( "%Y-%m-%d %H:%M" )

    ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
    updated = atom_item.updated.content
    item.updated = updated.nil? ? nil : updated.to_datetime
    logger.debug " atom | item.updated.content >#{updated}< : #{updated.class.name}"
  end

  if atom_item.published
    ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
    published = atom_item.published.content
    item.published = published.nil? ? nil : published.to_datetime
    logger.debug " atom | item.published.content >#{published}< : #{published.class.name}"
  end

  guid = atom_item.id ? atom_item.id.content : nil
  item.guid = guid
  logger.debug " atom | item.id.content: >#{guid}< : #{guid.class.name}"

  item.content = atom_item.content.content if atom_item.content
  item.summary = atom_item.summary.content if atom_item.summary

  item
end # method build_feed_item
139
+
140
+ end # AtomFeedBuilder
141
+ end # FeedParser
@@ -0,0 +1,129 @@
1
+
2
+ module FeedParser
3
+
4
+ ### todo/fix:
5
+ # rename to Rss20FeedBuilder?? or FeedBuilderRss20 ??
6
+
7
+ class RssFeedBuilder
8
+
9
+ include LogUtils::Logging
10
+
11
# Converts the given RSS feed right away (via build_feed) and keeps
# the result for to_feed.
#
# rss_feed - the parsed RSS feed object passed on to build_feed
def initialize(rss_feed)
  @feed = build_feed(rss_feed)
end
14
+
15
# Returns the feed object that was stored in @feed by the constructor.
def to_feed
  @feed
end
18
+
19
# Convenience entry point: creates a builder for rss_feed and
# returns the feed it built.
#
# rss_feed - the parsed RSS feed object handed to the constructor
def self.build(rss_feed)
  new(rss_feed).to_feed
end
23
+
24
+
25
# Builds a normalized Feed from a parsed RSS feed.
#
# rss_feed - parsed RSS feed; this method reads #rss_version, #channel
#            (title, link, description, lastBuildDate, pubDate,
#            generator) and #items from it
#
# Returns the populated Feed; every item is converted with
# build_feed_item.
def build_feed( rss_feed )
  channel = rss_feed.channel

  feed = Feed.new
  ## feed.object = rss_feed # not used for now
  feed.format  = "rss #{rss_feed.rss_version}"

  # required channel elements
  feed.title   = channel.title
  feed.url     = channel.link
  feed.summary = channel.description

  logger.debug " rss | channel.description: >#{channel.description}< : #{channel.description.class.name}"

  # NOTE:
  #  All date-times in RSS conform
  #  to the Date and Time Specification of RFC 822
  #  e.g. Sun, 19 May 2012 15:21:36 GMT or
  #       Sat, 07 Sep 2013 00:00:01 GMT

  ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)

  # optional channel elements (nil if absent)
  built     = channel.lastBuildDate
  published = channel.pubDate
  feed.built     = (built.to_datetime unless built.nil?)
  feed.published = (published.to_datetime unless published.nil?)

  logger.debug " rss | channel.lastBuildDate: >#{built}< : #{built.class.name}"
  logger.debug " rss | channel.pubDate: >#{published}< : #{published.class.name}"

  feed.generator = channel.generator # optional

  logger.debug " rss | channel.generator: >#{channel.generator}< : #{channel.generator.class.name}"

  feed.items = rss_feed.items.map { |rss_item| build_feed_item( rss_item ) }

  feed # return new feed
end
64
+
65
# Builds a normalized Item from a parsed RSS item.
#
# rss_item - parsed RSS item; this method reads #title, #link,
#            #description, #content_encoded, #pubDate and #guid from it
#
# Returns the populated Item. If the item has no guid (or an empty
# one), the link is used as guid and a warning is logged.
def build_feed_item( rss_item )
  item = Item.new
  ## item.object = rss_item # not used for now

  item.title = rss_item.title
  item.url   = rss_item.link

  logger.debug " rss | item.title: >#{rss_item.title}< : #{rss_item.title.class.name}"
  logger.debug " rss | item.link: >#{rss_item.link}< : #{rss_item.link.class.name}"

  ## todo:
  ##  check if feedburner:origLink present - if yes, use it for url/link
  ##  example: use
  ##  - <feedburner:origLink>http://www.rubyflow.com/items/9803-gotta-ruby-s-syntax</feedburner:origLink></item>
  ##  instead of
  ##  - <link>http://feedproxy.google.com/~r/Rubyflow/~3/Ym9Sltg_2_c/9803-gotta-ruby-s-syntax</link>

  item.summary = rss_item.description

  # check for <content:encoded>
  #  -- using RSS 1.0 content module in RSS 2.0
  encoded = rss_item.content_encoded
  item.content = encoded
  logger.debug " rss | item.content_encoded[0..40]: >#{encoded ? encoded[0..40] : ''}< : #{encoded.class.name}"

  # NOTE:
  #  All date-times in RSS conform
  #  to the Date and Time Specification of RFC 822
  #  e.g. Sun, 19 May 2012 15:21:36 GMT or
  #       Sat, 07 Sep 2013 00:00:01 GMT

  ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
  published = rss_item.pubDate
  item.published = (published.to_datetime unless published.nil?)

  logger.debug " rss | item.pubDate: >#{published}< : #{published.class.name}"

  ## note: guid is optional
  ##  might be missing e.g. check lambda-the-ultimate.org, for example
  guid = rss_item.guid
  if guid && guid.content
    item.guid = guid.content
    logger.debug " rss | item.guid.content: >#{guid.content}< : #{guid.content.class.name}"
  else
    item.guid = rss_item.link
    logger.warn " rss | item.guid.content missing !!!! - using link for guid"
  end

  ### todo: add support or authors (incl. dc:creator)
  ##   <dc:creator>Dhaivat Pandya</dc:creator>

  # todo: categories
  #  <category><![CDATA[Gems]]></category>
  #  <category><![CDATA[Ruby]]></category>
  #  <category><![CDATA[Ruby on Rails]]></category>

  item
end # method build_feed_item
127
+
128
+ end # class RssFeedBuilder
129
+ end # module FeedParser