feedparser 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gemtest +0 -0
- data/HISTORY.md +3 -0
- data/Manifest.txt +20 -0
- data/README.md +95 -0
- data/Rakefile +32 -0
- data/lib/feedparser.rb +27 -0
- data/lib/feedparser/builder/atom.rb +141 -0
- data/lib/feedparser/builder/rss.rb +129 -0
- data/lib/feedparser/feed.rb +40 -0
- data/lib/feedparser/helper/atom_v03.rb +66 -0
- data/lib/feedparser/item.rb +34 -0
- data/lib/feedparser/parser.rb +53 -0
- data/lib/feedparser/version.rb +24 -0
- data/test/feeds/googlegroups.atom +18 -0
- data/test/feeds/googlegroups2.atom +20 -0
- data/test/feeds/quirksblog.atom.v03 +1098 -0
- data/test/helper.rb +35 -0
- data/test/test_atom.rb +45 -0
- data/test/test_atom_from_file.rb +28 -0
- data/test/test_atom_v03.rb +41 -0
- data/test/test_rss.rb +38 -0
- metadata +115 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ec4aa58ed835c53c649f6289eebf236e25802c69
|
4
|
+
data.tar.gz: e536aa9a2bd29ce9beb2f22765d40c5a027c920f
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 50fe366a7bd6d003f280a0c291d2569ad5122f78fbe7c48f988264ddd32244182ec9684bd8c4e26ea14215c143a7d6770704dee975743b98c7a10fdd34620c1e
|
7
|
+
data.tar.gz: f6d0210aa31b7b9878a25d27c208efe47a0aca840a7253116ea04f3ec4a1da51da069a07d7741f4e645263a440e1b4245a8df677534a70d536d5f3b10348a72c
|
data/.gemtest
ADDED
File without changes
|
data/HISTORY.md
ADDED
data/Manifest.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
HISTORY.md
|
2
|
+
Manifest.txt
|
3
|
+
README.md
|
4
|
+
Rakefile
|
5
|
+
lib/feedparser.rb
|
6
|
+
lib/feedparser/builder/atom.rb
|
7
|
+
lib/feedparser/builder/rss.rb
|
8
|
+
lib/feedparser/feed.rb
|
9
|
+
lib/feedparser/helper/atom_v03.rb
|
10
|
+
lib/feedparser/item.rb
|
11
|
+
lib/feedparser/parser.rb
|
12
|
+
lib/feedparser/version.rb
|
13
|
+
test/feeds/googlegroups.atom
|
14
|
+
test/feeds/googlegroups2.atom
|
15
|
+
test/feeds/quirksblog.atom.v03
|
16
|
+
test/helper.rb
|
17
|
+
test/test_atom.rb
|
18
|
+
test/test_atom_from_file.rb
|
19
|
+
test/test_atom_v03.rb
|
20
|
+
test/test_rss.rb
|
data/README.md
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
# feedparser
|
2
|
+
|
3
|
+
feedparser gem - web feed parser and normalizer (RSS 2.0, Atom, etc.)
|
4
|
+
|
5
|
+
* home :: [github.com/feedreader/feed.parser](https://github.com/feedreader/feed.parser)
|
6
|
+
* bugs :: [github.com/feedreader/feed.parser/issues](https://github.com/feedreader/feed.parser/issues)
|
7
|
+
* gem :: [rubygems.org/gems/feedparser](https://rubygems.org/gems/feedparser)
|
8
|
+
* rdoc :: [rubydoc.info/gems/feedparser](http://rubydoc.info/gems/feedparser)
|
9
|
+
* forum :: [groups.google.com/group/feedreader](http://groups.google.com/group/feedreader)
|
10
|
+
|
11
|
+
|
12
|
+
## Usage
|
13
|
+
|
14
|
+
### Structs
|
15
|
+
|
16
|
+
Feed • Item
|
17
|
+
|
18
|
+
### `Feed` Struct
|
19
|
+
|
20
|
+
~~~
|
21
|
+
class Feed
|
22
|
+
attr_accessor :format # e.g. atom|rss 2.0|etc.
|
23
|
+
attr_accessor :title
|
24
|
+
attr_accessor :title_type # e.g. text|html|html-escaped (optional) -use - why?? why not??
|
25
|
+
attr_accessor :url
|
26
|
+
|
27
|
+
attr_accessor :items
|
28
|
+
|
29
|
+
attr_accessor :summary # e.g. description (rss)
|
30
|
+
attr_accessor :summary_type # e.g. text|html|html-escaped
|
31
|
+
|
32
|
+
attr_accessor :title2 # e.g. subtitle (atom)
|
33
|
+
attr_accessor :title2_type # e.g. text|html|html-escaped
|
34
|
+
|
35
|
+
attr_accessor :published
|
36
|
+
attr_accessor :updated
|
37
|
+
attr_accessor :built
|
38
|
+
|
39
|
+
attr_accessor :generator
|
40
|
+
attr_accessor :generator_version # e.g. @version (atom)
|
41
|
+
attr_accessor :generator_uri # e.g. @uri (atom) - use alias url/link ???
|
42
|
+
end
|
43
|
+
~~~
|
44
|
+
|
45
|
+
|
46
|
+
### `Item` Struct
|
47
|
+
|
48
|
+
~~~
|
49
|
+
class Item
|
50
|
+
attr_accessor :title
|
51
|
+
attr_accessor :title_type # optional for now (text|html|html-escaped) - not yet set
|
52
|
+
attr_accessor :url # todo: rename to link (use alias) ??
|
53
|
+
|
54
|
+
attr_accessor :content
|
55
|
+
attr_accessor :content_type # optional for now (text|html|html-escaped|binary-base64) - not yet set
|
56
|
+
|
57
|
+
attr_accessor :summary
|
58
|
+
attr_accessor :summary_type # optional for now (text|html|html-escaped) - not yet set
|
59
|
+
|
60
|
+
attr_accessor :published
|
61
|
+
attr_accessor :updated
|
62
|
+
|
63
|
+
attr_accessor :guid # todo: rename to id (use alias) ??
|
64
|
+
end
|
65
|
+
~~~
|
66
|
+
|
67
|
+
|
68
|
+
### Read Feed Example
|
69
|
+
|
70
|
+
~~~
|
71
|
+
require 'open-uri'
|
72
|
+
require 'feedparser'
|
73
|
+
|
74
|
+
xml = open( 'http://openfootball.github.io/atom.xml' ).read
|
75
|
+
|
76
|
+
feed = FeedParser::Parser.parse( xml )
|
77
|
+
pp feed
|
78
|
+
~~~
|
79
|
+
|
80
|
+
|
81
|
+
|
82
|
+
## Alternatives
|
83
|
+
|
84
|
+
- [`syndication`](http://syndication.rubyforge.org) [(Source)](https://github.com/lpar/syndication) - by Mathew (aka lpar); RSS 1.0, 2.0, Atom, and understands namespaces; optional support for Dublin Core, iTunes/podcast feeds, and RSS 1.0 Syndication and Content modules
|
85
|
+
- [`simple-rss`](http://rubyforge.org/projects/simple-rss)
|
86
|
+
- [`feedtools`](http://rubyforge.org/projects/feedtools)
|
87
|
+
|
88
|
+
TBD
|
89
|
+
|
90
|
+
|
91
|
+
## License
|
92
|
+
|
93
|
+
The `feedparser` scripts are dedicated to the public domain.
|
94
|
+
Use it as you please with no restrictions whatsoever.
|
95
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'hoe'
|
2
|
+
require './lib/feedparser/version.rb'
|
3
|
+
|
4
|
+
Hoe.spec 'feedparser' do
|
5
|
+
|
6
|
+
self.version = FeedParser::VERSION
|
7
|
+
|
8
|
+
self.summary = 'feedparser - web feed parser and normalizer (RSS 2.0, Atom, etc.)'
|
9
|
+
self.description = summary
|
10
|
+
|
11
|
+
self.urls = ['https://github.com/feedreader/feed.parser']
|
12
|
+
|
13
|
+
self.author = 'Gerald Bauer'
|
14
|
+
self.email = 'feedreader@googlegroups.com'
|
15
|
+
|
16
|
+
# switch extension to .markdown for github formatting
|
17
|
+
self.readme_file = 'README.md'
|
18
|
+
self.history_file = 'HISTORY.md'
|
19
|
+
|
20
|
+
self.extra_deps = [
|
21
|
+
['logutils', '>= 0.6.1']
|
22
|
+
]
|
23
|
+
|
24
|
+
### todo: add fetcher dep for testing (e.g. development only)
|
25
|
+
|
26
|
+
self.licenses = ['Public Domain']
|
27
|
+
|
28
|
+
self.spec_extras = {
|
29
|
+
required_ruby_version: '>= 1.9.2'
|
30
|
+
}
|
31
|
+
|
32
|
+
end
|
data/lib/feedparser.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# core and stdlibs
|
2
|
+
|
3
|
+
require 'rss'
|
4
|
+
require 'pp'
|
5
|
+
require 'date'
|
6
|
+
|
7
|
+
# 3rd party gems/libs
|
8
|
+
|
9
|
+
require 'logutils'
|
10
|
+
|
11
|
+
# our own code
|
12
|
+
|
13
|
+
require 'feedparser/version' # let it always go first
|
14
|
+
|
15
|
+
require 'feedparser/builder/atom'
|
16
|
+
require 'feedparser/builder/rss'
|
17
|
+
|
18
|
+
require 'feedparser/helper/atom_v03'
|
19
|
+
|
20
|
+
require 'feedparser/feed'
|
21
|
+
require 'feedparser/item'
|
22
|
+
require 'feedparser/parser'
|
23
|
+
|
24
|
+
|
25
|
+
|
26
|
+
# say hello
|
27
|
+
puts FeedParser.banner if $DEBUG || (defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG)
|
@@ -0,0 +1,141 @@
|
|
1
|
+
|
2
|
+
module FeedParser
|
3
|
+
|
4
|
+
class AtomFeedBuilder
|
5
|
+
|
6
|
+
include LogUtils::Logging
|
7
|
+
|
8
|
+
def initialize( atom_feed )
|
9
|
+
@feed = build_feed( atom_feed )
|
10
|
+
end
|
11
|
+
|
12
|
+
def to_feed
|
13
|
+
@feed
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.build( atom_feed )
|
17
|
+
feed = self.new( atom_feed )
|
18
|
+
feed.to_feed
|
19
|
+
end
|
20
|
+
|
21
|
+
|
22
|
+
def build_feed( atom_feed )
|
23
|
+
feed = Feed.new
|
24
|
+
## feed.object = atom_feed # not used for now
|
25
|
+
feed.format = 'atom'
|
26
|
+
|
27
|
+
feed.title = atom_feed.title.content
|
28
|
+
logger.debug " atom | title.content >#{atom_feed.title.content}< : #{atom_feed.title.content.class.name}"
|
29
|
+
|
30
|
+
|
31
|
+
logger.debug " atom | id.content >#{atom_feed.id.content}< : #{atom_feed.id.content.class.name}"
|
32
|
+
|
33
|
+
feed.url = nil
|
34
|
+
|
35
|
+
## note: use links (plural to allow multiple links e.g. self,alternate,etc.)
|
36
|
+
atom_feed.links.each_with_index do |link,i|
|
37
|
+
logger.debug " atom | link[#{i+1}] link rel=>#{link.rel}< : #{link.rel.class.name} type=#{link.type} href=#{link.href}"
|
38
|
+
|
39
|
+
## for now assume alternate is link or no rel specified (assumes alternate)
|
40
|
+
## note: only set if feed.url is NOT already set (via <id> for example)
|
41
|
+
if feed.url.nil? && (link.rel == 'alternate' || link.rel.nil?)
|
42
|
+
feed.url = link.href
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
## note: as fallback try id if still no url found
|
47
|
+
## use url only if starts_with http
|
48
|
+
## might not be link e.g blogger uses for ids =>
|
49
|
+
## <id>tag:blogger.com,1999:blog-4704664917418794835</id>
|
50
|
+
##
|
51
|
+
## note: id might actually be link to feed NOT to site (remove fallback - why - why not???)
|
52
|
+
##
|
53
|
+
## Note: remove (strip) leading and trailing spaces and newlines
|
54
|
+
|
55
|
+
if feed.url.nil? && atom_feed.id.content.strip.start_with?( 'http' )
|
56
|
+
feed.url = atom_feed.id.content.strip
|
57
|
+
end
|
58
|
+
|
59
|
+
|
60
|
+
if atom_feed.updated
|
61
|
+
# NOTE: empty updated.content e.g. used by google groups feed
|
62
|
+
# will return nil : NilClass
|
63
|
+
|
64
|
+
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
65
|
+
|
66
|
+
feed.updated = atom_feed.updated.content.nil? ? nil : atom_feed.updated.content.to_datetime # .utc.strftime( "%Y-%m-%d %H:%M" )
|
67
|
+
logger.debug " atom | updated.content >#{atom_feed.updated.content}< : #{atom_feed.updated.content.class.name}"
|
68
|
+
end
|
69
|
+
|
70
|
+
if atom_feed.generator
|
71
|
+
## Note: remove (strip) leading and trailing spaces and newlines
|
72
|
+
feed.generator = atom_feed.generator.content.strip
|
73
|
+
logger.debug " atom | generator.content >#{atom_feed.generator.content}< : #{atom_feed.generator.content.class.name}"
|
74
|
+
|
75
|
+
# pp atom_feed.generator
|
76
|
+
feed.generator_version = atom_feed.generator.version
|
77
|
+
feed.generator_uri = atom_feed.generator.uri
|
78
|
+
logger.debug " atom | generator.version >#{atom_feed.generator.version}< : #{atom_feed.generator.version.class.name}"
|
79
|
+
logger.debug " atom | generator.uri >#{atom_feed.generator.uri}< : #{atom_feed.generator.uri.class.name}"
|
80
|
+
end
|
81
|
+
|
82
|
+
if atom_feed.subtitle
|
83
|
+
feed.title2 = atom_feed.subtitle.content
|
84
|
+
logger.debug " atom | subtitle.content >#{atom_feed.subtitle.content}< : #{atom_feed.subtitle.content.class.name}"
|
85
|
+
end
|
86
|
+
|
87
|
+
|
88
|
+
items = []
|
89
|
+
atom_feed.items.each do |atom_item|
|
90
|
+
items << build_feed_item( atom_item )
|
91
|
+
end
|
92
|
+
feed.items = items
|
93
|
+
|
94
|
+
feed # return new feed
|
95
|
+
end # method build_feed
|
96
|
+
|
97
|
+
def build_feed_item( atom_item )
|
98
|
+
item = Item.new # Item.new
|
99
|
+
## item.object = atom_item # not used for now
|
100
|
+
|
101
|
+
item.title = atom_item.title.content
|
102
|
+
item.url = atom_item.link.href
|
103
|
+
|
104
|
+
logger.debug " atom | item.title.content: >#{atom_item.title.content}< : #{atom_item.title.content.class.name}"
|
105
|
+
logger.debug " atom | item.link.href: >#{atom_item.link.href}< : #{atom_item.link.href.class.name}"
|
106
|
+
|
107
|
+
|
108
|
+
if atom_item.updated
|
109
|
+
## change time to utc if present? why? why not?
|
110
|
+
# -- .utc.strftime( "%Y-%m-%d %H:%M" )
|
111
|
+
|
112
|
+
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
113
|
+
|
114
|
+
item.updated = atom_item.updated.content.nil? ? nil : atom_item.updated.content.to_datetime
|
115
|
+
logger.debug " atom | item.updated.content >#{atom_item.updated.content}< : #{atom_item.updated.content.class.name}"
|
116
|
+
end
|
117
|
+
|
118
|
+
if atom_item.published
|
119
|
+
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
120
|
+
|
121
|
+
item.published = atom_item.published.content.nil? ? nil : atom_item.published.content.to_datetime
|
122
|
+
logger.debug " atom | item.published.content >#{atom_item.published.content}< : #{atom_item.published.content.class.name}"
|
123
|
+
end
|
124
|
+
|
125
|
+
|
126
|
+
item.guid = atom_item.id.content
|
127
|
+
logger.debug " atom | item.id.content: >#{atom_item.id.content}< : #{atom_item.id.content.class.name}"
|
128
|
+
|
129
|
+
if atom_item.content
|
130
|
+
item.content = atom_item.content.content
|
131
|
+
end
|
132
|
+
|
133
|
+
if atom_item.summary
|
134
|
+
item.summary = atom_item.summary.content
|
135
|
+
end
|
136
|
+
|
137
|
+
item
|
138
|
+
end # method build_feed_item
|
139
|
+
|
140
|
+
end # AtomFeedBuilder
|
141
|
+
end # FeedParser
|
@@ -0,0 +1,129 @@
|
|
1
|
+
|
2
|
+
module FeedParser
|
3
|
+
|
4
|
+
### todo/fix:
|
5
|
+
# rename to Rss20FeedBuilder?? or FeedBuilderRss20 ??
|
6
|
+
|
7
|
+
class RssFeedBuilder
|
8
|
+
|
9
|
+
include LogUtils::Logging
|
10
|
+
|
11
|
+
def initialize( rss_feed )
|
12
|
+
@feed = build_feed( rss_feed )
|
13
|
+
end
|
14
|
+
|
15
|
+
def to_feed
|
16
|
+
@feed
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.build( rss_feed )
|
20
|
+
feed = self.new( rss_feed )
|
21
|
+
feed.to_feed
|
22
|
+
end
|
23
|
+
|
24
|
+
|
25
|
+
def build_feed( rss_feed )
|
26
|
+
feed = Feed.new
|
27
|
+
## feed.object = rss_feed # not used for now
|
28
|
+
feed.format = "rss #{rss_feed.rss_version}"
|
29
|
+
|
30
|
+
feed.title = rss_feed.channel.title # required
|
31
|
+
feed.url = rss_feed.channel.link # required
|
32
|
+
feed.summary = rss_feed.channel.description # required
|
33
|
+
|
34
|
+
logger.debug " rss | channel.description: >#{rss_feed.channel.description}< : #{rss_feed.channel.description.class.name}"
|
35
|
+
|
36
|
+
# NOTE:
|
37
|
+
# All date-times in RSS conform
|
38
|
+
# to the Date and Time Specification of RFC 822
|
39
|
+
# e.g. Sun, 19 May 2012 15:21:36 GMT or
|
40
|
+
# Sat, 07 Sep 2013 00:00:01 GMT
|
41
|
+
|
42
|
+
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
43
|
+
|
44
|
+
feed.built = rss_feed.channel.lastBuildDate.nil? ? nil : rss_feed.channel.lastBuildDate.to_datetime # optional
|
45
|
+
feed.published = rss_feed.channel.pubDate.nil? ? nil : rss_feed.channel.pubDate.to_datetime # optional
|
46
|
+
|
47
|
+
logger.debug " rss | channel.lastBuildDate: >#{rss_feed.channel.lastBuildDate}< : #{rss_feed.channel.lastBuildDate.class.name}"
|
48
|
+
logger.debug " rss | channel.pubDate: >#{rss_feed.channel.pubDate}< : #{rss_feed.channel.pubDate.class.name}"
|
49
|
+
|
50
|
+
|
51
|
+
feed.generator = rss_feed.channel.generator # optional
|
52
|
+
|
53
|
+
logger.debug " rss | channel.generator: >#{rss_feed.channel.generator}< : #{rss_feed.channel.generator.class.name}"
|
54
|
+
|
55
|
+
|
56
|
+
items = []
|
57
|
+
rss_feed.items.each do |rss_item|
|
58
|
+
items << build_feed_item( rss_item )
|
59
|
+
end
|
60
|
+
feed.items = items
|
61
|
+
|
62
|
+
feed # return new feed
|
63
|
+
end
|
64
|
+
|
65
|
+
def build_feed_item( rss_item )
|
66
|
+
|
67
|
+
item = Item.new
|
68
|
+
## item.object = rss_item # not used for now
|
69
|
+
|
70
|
+
item.title = rss_item.title
|
71
|
+
item.url = rss_item.link
|
72
|
+
|
73
|
+
logger.debug " rss | item.title: >#{rss_item.title}< : #{rss_item.title.class.name}"
|
74
|
+
logger.debug " rss | item.link: >#{rss_item.link}< : #{rss_item.link.class.name}"
|
75
|
+
|
76
|
+
## todo:
|
77
|
+
## check if feedburner:origLink present - if yes, use it for url/link
|
78
|
+
## example: use
|
79
|
+
## - <feedburner:origLink>http://www.rubyflow.com/items/9803-gotta-ruby-s-syntax</feedburner:origLink></item>
|
80
|
+
## instead of
|
81
|
+
## - <link>http://feedproxy.google.com/~r/Rubyflow/~3/Ym9Sltg_2_c/9803-gotta-ruby-s-syntax</link>
|
82
|
+
|
83
|
+
|
84
|
+
item.summary = rss_item.description
|
85
|
+
|
86
|
+
# check for <content:encoded>
|
87
|
+
# -- using RSS 1.0 content module in RSS 2.0
|
88
|
+
item.content = rss_item.content_encoded
|
89
|
+
logger.debug " rss | item.content_encoded[0..40]: >#{rss_item.content_encoded ? rss_item.content_encoded[0..40] : ''}< : #{rss_item.content_encoded.class.name}"
|
90
|
+
|
91
|
+
# NOTE:
|
92
|
+
# All date-times in RSS conform
|
93
|
+
# to the Date and Time Specification of RFC 822
|
94
|
+
# e.g. Sun, 19 May 2012 15:21:36 GMT or
|
95
|
+
# Sat, 07 Sep 2013 00:00:01 GMT
|
96
|
+
|
97
|
+
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
98
|
+
|
99
|
+
item.published = rss_item.pubDate.nil? ? nil : rss_item.pubDate.to_datetime # .utc.strftime( "%Y-%m-%d %H:%M" )
|
100
|
+
|
101
|
+
logger.debug " rss | item.pubDate: >#{rss_item.pubDate}< : #{rss_item.pubDate.class.name}"
|
102
|
+
|
103
|
+
|
104
|
+
## fix/todo: check if rss_item.guid present? !!!!
|
105
|
+
##
|
106
|
+
## might be the case e.g. check lambda-the-ultimate.org, for example
|
107
|
+
|
108
|
+
if rss_item.guid && rss_item.guid.content
|
109
|
+
item.guid = rss_item.guid.content
|
110
|
+
logger.debug " rss | item.guid.content: >#{rss_item.guid.content}< : #{rss_item.guid.content.class.name}"
|
111
|
+
else
|
112
|
+
item.guid = rss_item.link
|
113
|
+
logger.warn " rss | item.guid.content missing !!!! - using link for guid"
|
114
|
+
end
|
115
|
+
|
116
|
+
### todo: add support for authors (incl. dc:creator)
|
117
|
+
## <dc:creator>Dhaivat Pandya</dc:creator>
|
118
|
+
|
119
|
+
# todo: categories
|
120
|
+
# <category><![CDATA[Gems]]></category>
|
121
|
+
# <category><![CDATA[Ruby]]></category>
|
122
|
+
# <category><![CDATA[Ruby on Rails]]></category>
|
123
|
+
|
124
|
+
|
125
|
+
item
|
126
|
+
end # method build_feed_item
|
127
|
+
|
128
|
+
end # class RssFeedBuilder
|
129
|
+
end # module FeedParser
|