feedparser 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ec4aa58ed835c53c649f6289eebf236e25802c69
4
+ data.tar.gz: e536aa9a2bd29ce9beb2f22765d40c5a027c920f
5
+ SHA512:
6
+ metadata.gz: 50fe366a7bd6d003f280a0c291d2569ad5122f78fbe7c48f988264ddd32244182ec9684bd8c4e26ea14215c143a7d6770704dee975743b98c7a10fdd34620c1e
7
+ data.tar.gz: f6d0210aa31b7b9878a25d27c208efe47a0aca840a7253116ea04f3ec4a1da51da069a07d7741f4e645263a440e1b4245a8df677534a70d536d5f3b10348a72c
File without changes
@@ -0,0 +1,3 @@
1
+ ### 0.1.0 / 2013-09-19
2
+
3
+ * Everything is new. First release.
@@ -0,0 +1,20 @@
1
+ HISTORY.md
2
+ Manifest.txt
3
+ README.md
4
+ Rakefile
5
+ lib/feedparser.rb
6
+ lib/feedparser/builder/atom.rb
7
+ lib/feedparser/builder/rss.rb
8
+ lib/feedparser/feed.rb
9
+ lib/feedparser/helper/atom_v03.rb
10
+ lib/feedparser/item.rb
11
+ lib/feedparser/parser.rb
12
+ lib/feedparser/version.rb
13
+ test/feeds/googlegroups.atom
14
+ test/feeds/googlegroups2.atom
15
+ test/feeds/quirksblog.atom.v03
16
+ test/helper.rb
17
+ test/test_atom.rb
18
+ test/test_atom_from_file.rb
19
+ test/test_atom_v03.rb
20
+ test/test_rss.rb
@@ -0,0 +1,95 @@
1
+ # feedparser
2
+
3
+ feedparser gem - web feed parser and normalizer (RSS 2.0, Atom, etc.)
4
+
5
+ * home :: [github.com/feedreader/feed.parser](https://github.com/feedreader/feed.parser)
6
+ * bugs :: [github.com/feedreader/feed.parser/issues](https://github.com/feedreader/feed.parser/issues)
7
+ * gem :: [rubygems.org/gems/feedparser](https://rubygems.org/gems/feedparser)
8
+ * rdoc :: [rubydoc.info/gems/feedparser](http://rubydoc.info/gems/feedparser)
9
+ * forum :: [groups.google.com/group/feedreader](http://groups.google.com/group/feedreader)
10
+
11
+
12
+ ## Usage
13
+
14
+ ### Structs
15
+
16
+ Feed • Item
17
+
18
+ ### `Feed` Struct
19
+
20
+ ~~~
21
+ class Feed
22
+ attr_accessor :format # e.g. atom|rss 2.0|etc.
23
+ attr_accessor :title
24
+ attr_accessor :title_type # e.g. text|html|html-escaped (optional) -use - why?? why not??
25
+ attr_accessor :url
26
+
27
+ attr_accessor :items
28
+
29
+ attr_accessor :summary # e.g. description (rss)
30
+ attr_accessor :summary_type # e.g. text|html|html-escaped
31
+
32
+ attr_accessor :title2 # e.g. subtitle (atom)
33
+ attr_accessor :title2_type # e.g. text|html|html-escaped
34
+
35
+ attr_accessor :published
36
+ attr_accessor :updated
37
+ attr_accessor :built
38
+
39
+ attr_accessor :generator
40
+ attr_accessor :generator_version # e.g. @version (atom)
41
+ attr_accessor :generator_uri # e.g. @uri (atom) - use alias url/link ???
42
+ end
43
+ ~~~
44
+
45
+
46
+ ### `Item` Struct
47
+
48
+ ~~~
49
+ class Item
50
+ attr_accessor :title
51
+ attr_accessor :title_type # optional for now (text|html|html-escaped) - not yet set
52
+ attr_accessor :url # todo: rename to link (use alias) ??
53
+
54
+ attr_accessor :content
55
+ attr_accessor :content_type # optional for now (text|html|html-escaped|binary-base64) - not yet set
56
+
57
+ attr_accessor :summary
58
+ attr_accessor :summary_type # optional for now (text|html|html-escaped) - not yet set
59
+
60
+ attr_accessor :published
61
+ attr_accessor :updated
62
+
63
+ attr_accessor :guid # todo: rename to id (use alias) ??
64
+ end
65
+ ~~~
66
+
67
+
68
+ ### Read Feed Example
69
+
70
+ ~~~
71
+ require 'open-uri'
72
+ require 'feedparser'
73
+
74
+ xml = open( 'http://openfootball.github.io/atom.xml' ).read
75
+
76
+ feed = FeedParser::Parser.parse( xml )
77
+ pp feed
78
+ ~~~
79
+
80
+
81
+
82
+ ## Alternatives
83
+
84
+ - [`syndication`](http://syndication.rubyforge.org) [(Source)](https://github.com/lpar/syndication) - by Mathew (aka lpar); RSS 1.0, 2.0, Atom, and understands namespaces; optional support for Dublin Core, iTunes/podcast feeds, and RSS 1.0 Syndication and Content modules
85
+ - [`simple-rss`](http://rubyforge.org/projects/simple-rss)
86
+ - [`feedtools`](http://rubyforge.org/projects/feedtools)
87
+
88
+ TBD
89
+
90
+
91
+ ## License
92
+
93
+ The `feedparser` scripts are dedicated to the public domain.
94
+ Use it as you please with no restrictions whatsoever.
95
+
@@ -0,0 +1,32 @@
1
+ require 'hoe'
2
+ require './lib/feedparser/version.rb'
3
+
4
# Gem specification for feedparser, declared through Hoe.
Hoe.spec 'feedparser' do

  self.version = FeedParser::VERSION

  # who
  self.author = 'Gerald Bauer'
  self.email  = 'feedreader@googlegroups.com'

  # what (the description simply reuses the one-line summary)
  self.summary     = 'feedparser - web feed parser and normalizer (RSS 2.0, Atom, etc.)'
  self.description = self.summary

  # where
  self.urls = ['https://github.com/feedreader/feed.parser']

  # docs - use the .md extension so github renders the files as markdown
  self.history_file = 'HISTORY.md'
  self.readme_file  = 'README.md'

  # runtime dependencies
  ### todo: add fetcher dep for testing (e.g. development only)
  self.extra_deps = [
    ['logutils', '>= 0.6.1']
  ]

  self.licenses = ['Public Domain']

  # extra settings passed on to the generated gemspec
  self.spec_extras = {
    :required_ruby_version => '>= 1.9.2'
  }

end
@@ -0,0 +1,27 @@
1
+ # core and stdlibs
2
+
3
+ require 'rss'
4
+ require 'pp'
5
+ require 'date'
6
+
7
+ # 3rd party gems/libs
8
+
9
+ require 'logutils'
10
+
11
+ # our own code
12
+
13
+ require 'feedparser/version' # let it always go first
14
+
15
+ require 'feedparser/builder/atom'
16
+ require 'feedparser/builder/rss'
17
+
18
+ require 'feedparser/helper/atom_v03'
19
+
20
+ require 'feedparser/feed'
21
+ require 'feedparser/item'
22
+ require 'feedparser/parser'
23
+
24
+
25
+
26
+ # say hello
27
# Print the library banner when Ruby runs in debug mode or when the
# $RUBYLIBS_DEBUG global has been set to a truthy value.
rubylibs_debug = defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG
if $DEBUG || rubylibs_debug
  puts FeedParser.banner
end
@@ -0,0 +1,141 @@
1
+
2
module FeedParser

  # Builds a format-neutral Feed (with its Items) from a parsed Atom feed.
  #
  # NOTE(review): the input is presumably an RSS::Atom::Feed produced by
  # Ruby's bundled rss library - confirm against parser.rb. Only the
  # accessors called below (title, id, links, updated, generator, subtitle,
  # items and the per-entry equivalents) are relied upon.
  class AtomFeedBuilder

    include LogUtils::Logging

    # Builds the Feed eagerly; fetch the result with #to_feed.
    def initialize( atom_feed )
      @feed = build_feed( atom_feed )
    end

    # Returns the Feed built from the Atom feed passed to the constructor.
    def to_feed
      @feed
    end

    # Shortcut for AtomFeedBuilder.new( atom_feed ).to_feed
    def self.build( atom_feed )
      new( atom_feed ).to_feed
    end


    # Maps the feed-level Atom elements onto a new Feed and converts every
    # entry with #build_feed_item.
    #
    # Elements that are missing (or present but empty) result in nil
    # attributes instead of a NoMethodError.
    def build_feed( atom_feed )
      feed = Feed.new
      ## feed.object = atom_feed   # not use for now
      feed.format = 'atom'

      feed.title = content_of( atom_feed.title )
      logger.debug " atom | title.content >#{feed.title}< : #{feed.title.class.name}"

      feed_id = content_of( atom_feed.id )
      logger.debug " atom | id.content >#{feed_id}< : #{feed_id.class.name}"

      ## note: use links (plural to allow multiple links e.g. self,alternate,etc.)
      ## for now assume alternate is link or no rel specified (assumes alternate)
      feed.url = alternate_href( atom_feed.links, 'link' )

      ## note: as fallback try id if still no url found
      ##   use url only if starts_with http
      ##   might not be link e.g blogger uses for ids =>
      ##     <id>tag:blogger.com,1999:blog-4704664917418794835</id>
      ##
      ## note: id might actually be link to feed NOT to site (remove fallback - why - why not???)
      ##
      ## Note: remove (strip) leading and trailing spaces and newlines
      if feed.url.nil? && feed_id && feed_id.strip.start_with?( 'http' )
        feed.url = feed_id.strip
      end

      if atom_feed.updated
        # NOTE: empty updated.content e.g. used by google groups feed
        #   will return nil : NilClass
        feed.updated = datetime_of( atom_feed.updated )
        logger.debug " atom | updated.content >#{atom_feed.updated.content}< : #{atom_feed.updated.content.class.name}"
      end

      generator = atom_feed.generator
      if generator
        ## Note: remove (strip) leading and trailing spaces and newlines;
        ##   an empty <generator/> element has no content to strip
        generator_text = generator.content
        feed.generator = generator_text && generator_text.strip
        logger.debug " atom | generator.content >#{generator_text}< : #{generator_text.class.name}"

        feed.generator_version = generator.version
        feed.generator_uri     = generator.uri
        logger.debug " atom | generator.version >#{generator.version}< : #{generator.version.class.name}"
        logger.debug " atom | generator.uri >#{generator.uri}< : #{generator.uri.class.name}"
      end

      if atom_feed.subtitle
        feed.title2 = atom_feed.subtitle.content
        logger.debug " atom | subtitle.content >#{atom_feed.subtitle.content}< : #{atom_feed.subtitle.content.class.name}"
      end

      feed.items = atom_feed.items.map { |atom_item| build_feed_item( atom_item ) }

      feed # return new feed
    end # method build_feed

    # Maps a single Atom entry onto a new Item.
    #
    # The item url is chosen the same way as the feed url: the first link
    # with rel="alternate" (or without rel) wins. If no such link exists the
    # first link of any rel is used (the previous behaviour); entries
    # without any link get a nil url.
    def build_feed_item( atom_item )
      item = Item.new
      ## item.object = atom_item   # not used for now

      item.title = content_of( atom_item.title )

      item.url = alternate_href( atom_item.links, 'item.link' )
      if item.url.nil? && atom_item.link
        item.url = atom_item.link.href
      end

      logger.debug " atom | item.title.content: >#{item.title}< : #{item.title.class.name}"
      logger.debug " atom | item.link.href: >#{item.url}< : #{item.url.class.name}"

      if atom_item.updated
        ## change time to utc if present? why? why not?
        #  -- .utc.strftime( "%Y-%m-%d %H:%M" )
        item.updated = datetime_of( atom_item.updated )
        logger.debug " atom | item.updated.content >#{atom_item.updated.content}< : #{atom_item.updated.content.class.name}"
      end

      if atom_item.published
        item.published = datetime_of( atom_item.published )
        logger.debug " atom | item.published.content >#{atom_item.published.content}< : #{atom_item.published.content.class.name}"
      end

      item.guid = content_of( atom_item.id )
      logger.debug " atom | item.id.content: >#{item.guid}< : #{item.guid.class.name}"

      if atom_item.content
        item.content = atom_item.content.content
      end

      if atom_item.summary
        item.summary = atom_item.summary.content
      end

      item
    end # method build_feed_item


    private

    # Returns element.content, or nil when the element itself is missing.
    def content_of( element )
      element && element.content
    end

    # Returns the element's content converted with #to_datetime (avoids
    # errors on windows w/ builtin rss lib), or nil when the element or its
    # content is missing.
    def datetime_of( element )
      time = content_of( element )
      time && time.to_datetime
    end

    # Returns the href of the first link whose rel is 'alternate' or not
    # specified (no rel is assumed to mean alternate), or nil when no link
    # qualifies. Every link is logged at debug level; label prefixes the
    # log line (e.g. 'link' or 'item.link').
    def alternate_href( links, label )
      href = nil
      Array( links ).each_with_index do |link,i|
        logger.debug " atom | #{label}[#{i+1}] link rel=>#{link.rel}< : #{link.rel.class.name} type=#{link.type} href=#{link.href}"

        if href.nil? && (link.rel == 'alternate' || link.rel.nil?)
          href = link.href
        end
      end
      href
    end

  end # AtomFeedBuilder
end # FeedParser
@@ -0,0 +1,129 @@
1
+
2
module FeedParser

  ### todo/fix:
  #  rename to Rss20FeedBuilder?? or FeedBuilderRss20 ??

  # Builds a format-neutral Feed (with its Items) from a parsed RSS feed.
  class RssFeedBuilder

    include LogUtils::Logging

    # Shortcut for RssFeedBuilder.new( rss_feed ).to_feed
    def self.build( rss_feed )
      builder = new( rss_feed )
      builder.to_feed
    end

    # Converts the feed right away; the result is available via #to_feed.
    def initialize( rss_feed )
      @feed = build_feed( rss_feed )
    end

    # Returns the Feed built from the RSS feed passed to the constructor.
    def to_feed
      @feed
    end


    # Copies the channel-level fields onto a new Feed and converts every
    # RSS item with #build_feed_item.
    def build_feed( rss_feed )
      feed = Feed.new
      ## feed.object = rss_feed   # not use for now
      feed.format = "rss #{rss_feed.rss_version}"

      channel = rss_feed.channel

      # title, link and description are required channel elements
      feed.title   = channel.title
      feed.url     = channel.link
      description  = channel.description
      feed.summary = description

      logger.debug " rss | channel.description: >#{description}< : #{description.class.name}"

      # NOTE:
      #  All date-times in RSS conform
      #   to the Date and Time Specification of RFC 822
      #   e.g. Sun, 19 May 2012 15:21:36 GMT  or
      #        Sat, 07 Sep 2013 00:00:01 GMT

      ## times get converted with to_datetime (avoid errors on windows w/ builtin rss lib)
      as_datetime = lambda { |time| time.nil? ? nil : time.to_datetime }

      last_build_date = channel.lastBuildDate   # optional
      pub_date        = channel.pubDate         # optional

      feed.built     = as_datetime.call( last_build_date )
      feed.published = as_datetime.call( pub_date )

      logger.debug " rss | channel.lastBuildDate: >#{last_build_date}< : #{last_build_date.class.name}"
      logger.debug " rss | channel.pubDate: >#{pub_date}< : #{pub_date.class.name}"

      generator = channel.generator   # optional
      feed.generator = generator

      logger.debug " rss | channel.generator: >#{generator}< : #{generator.class.name}"

      feed.items = rss_feed.items.map { |rss_item| build_feed_item( rss_item ) }

      feed   # hand back the new feed
    end

    # Copies the fields of a single RSS item onto a new Item.
    #
    # When the item has no guid (or an empty one) the item link is used as
    # guid instead and a warning is logged.
    def build_feed_item( rss_item )
      item = Item.new
      ## item.object = rss_item   # not use for now

      title = rss_item.title
      link  = rss_item.link

      item.title = title
      item.url   = link

      logger.debug " rss | item.title: >#{title}< : #{title.class.name}"
      logger.debug " rss | item.link: >#{link}< : #{link.class.name}"

      ## todo:
      ##  check if feedburner:origLink present - if yes, use it for url/link
      ##  example: use
      ##   - <feedburner:origLink>http://www.rubyflow.com/items/9803-gotta-ruby-s-syntax</feedburner:origLink></item>
      ##  instead of
      ##   - <link>http://feedproxy.google.com/~r/Rubyflow/~3/Ym9Sltg_2_c/9803-gotta-ruby-s-syntax</link>

      item.summary = rss_item.description

      # <content:encoded> -- RSS 1.0 content module used in RSS 2.0
      encoded = rss_item.content_encoded
      item.content = encoded
      logger.debug " rss | item.content_encoded[0..40]: >#{encoded ? encoded[0..40] : ''}< : #{encoded.class.name}"

      # NOTE:
      #  All date-times in RSS conform
      #   to the Date and Time Specification of RFC 822
      #   e.g. Sun, 19 May 2012 15:21:36 GMT  or
      #        Sat, 07 Sep 2013 00:00:01 GMT

      ## converted with to_datetime (avoid errors on windows w/ builtin rss lib)
      pub_date = rss_item.pubDate
      if pub_date.nil?
        item.published = nil
      else
        item.published = pub_date.to_datetime   # .utc.strftime( "%Y-%m-%d %H:%M" )
      end

      logger.debug " rss | item.pubDate: >#{pub_date}< : #{pub_date.class.name}"

      ## the guid might be missing e.g. check lambda-the-ultimate.org, for example
      guid = rss_item.guid
      guid_content = guid && guid.content
      if guid_content
        item.guid = guid_content
        logger.debug " rss | item.guid.content: >#{guid_content}< : #{guid_content.class.name}"
      else
        item.guid = rss_item.link
        logger.warn " rss | item.guid.content missing !!!! - using link for guid"
      end

      ### todo: add support or authors (incl. dc:creator)
      ##  <dc:creator>Dhaivat Pandya</dc:creator>

      # todo: categories
      #  <category><![CDATA[Gems]]></category>
      #  <category><![CDATA[Ruby]]></category>
      #  <category><![CDATA[Ruby on Rails]]></category>

      item
    end # method build_feed_item

  end # class RssFeedBuilder
end # module FeedParser