feedparser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gemtest +0 -0
- data/HISTORY.md +3 -0
- data/Manifest.txt +20 -0
- data/README.md +95 -0
- data/Rakefile +32 -0
- data/lib/feedparser.rb +27 -0
- data/lib/feedparser/builder/atom.rb +141 -0
- data/lib/feedparser/builder/rss.rb +129 -0
- data/lib/feedparser/feed.rb +40 -0
- data/lib/feedparser/helper/atom_v03.rb +66 -0
- data/lib/feedparser/item.rb +34 -0
- data/lib/feedparser/parser.rb +53 -0
- data/lib/feedparser/version.rb +24 -0
- data/test/feeds/googlegroups.atom +18 -0
- data/test/feeds/googlegroups2.atom +20 -0
- data/test/feeds/quirksblog.atom.v03 +1098 -0
- data/test/helper.rb +35 -0
- data/test/test_atom.rb +45 -0
- data/test/test_atom_from_file.rb +28 -0
- data/test/test_atom_v03.rb +41 -0
- data/test/test_rss.rb +38 -0
- metadata +115 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ec4aa58ed835c53c649f6289eebf236e25802c69
|
4
|
+
data.tar.gz: e536aa9a2bd29ce9beb2f22765d40c5a027c920f
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 50fe366a7bd6d003f280a0c291d2569ad5122f78fbe7c48f988264ddd32244182ec9684bd8c4e26ea14215c143a7d6770704dee975743b98c7a10fdd34620c1e
|
7
|
+
data.tar.gz: f6d0210aa31b7b9878a25d27c208efe47a0aca840a7253116ea04f3ec4a1da51da069a07d7741f4e645263a440e1b4245a8df677534a70d536d5f3b10348a72c
|
data/.gemtest
ADDED
File without changes
|
data/HISTORY.md
ADDED
data/Manifest.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
HISTORY.md
|
2
|
+
Manifest.txt
|
3
|
+
README.md
|
4
|
+
Rakefile
|
5
|
+
lib/feedparser.rb
|
6
|
+
lib/feedparser/builder/atom.rb
|
7
|
+
lib/feedparser/builder/rss.rb
|
8
|
+
lib/feedparser/feed.rb
|
9
|
+
lib/feedparser/helper/atom_v03.rb
|
10
|
+
lib/feedparser/item.rb
|
11
|
+
lib/feedparser/parser.rb
|
12
|
+
lib/feedparser/version.rb
|
13
|
+
test/feeds/googlegroups.atom
|
14
|
+
test/feeds/googlegroups2.atom
|
15
|
+
test/feeds/quirksblog.atom.v03
|
16
|
+
test/helper.rb
|
17
|
+
test/test_atom.rb
|
18
|
+
test/test_atom_from_file.rb
|
19
|
+
test/test_atom_v03.rb
|
20
|
+
test/test_rss.rb
|
data/README.md
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
# feedparser
|
2
|
+
|
3
|
+
feedparser gem - web feed parser and normalizer (RSS 2.0, Atom, etc.)
|
4
|
+
|
5
|
+
* home :: [github.com/feedreader/feed.parser](https://github.com/feedreader/feed.parser)
|
6
|
+
* bugs :: [github.com/feedreader/feed.parser/issues](https://github.com/feedreader/feed.parser/issues)
|
7
|
+
* gem :: [rubygems.org/gems/feedparser](https://rubygems.org/gems/feedparser)
|
8
|
+
* rdoc :: [rubydoc.info/gems/feedparser](http://rubydoc.info/gems/feedparser)
|
9
|
+
* forum :: [groups.google.com/group/feedreader](http://groups.google.com/group/feedreader)
|
10
|
+
|
11
|
+
|
12
|
+
## Usage
|
13
|
+
|
14
|
+
### Structs
|
15
|
+
|
16
|
+
Feed • Item
|
17
|
+
|
18
|
+
### `Feed` Struct
|
19
|
+
|
20
|
+
~~~
|
21
|
+
class Feed
|
22
|
+
attr_accessor :format # e.g. atom|rss 2.0|etc.
|
23
|
+
attr_accessor :title
|
24
|
+
attr_accessor :title_type # e.g. text|html|html-escaped (optional) -use - why?? why not??
|
25
|
+
attr_accessor :url
|
26
|
+
|
27
|
+
attr_accessor :items
|
28
|
+
|
29
|
+
attr_accessor :summary # e.g. description (rss)
|
30
|
+
attr_accessor :summary_type # e.g. text|html|html-escaped
|
31
|
+
|
32
|
+
attr_accessor :title2 # e.g. subtitle (atom)
|
33
|
+
attr_accessor :title2_type # e.g. text|html|html-escaped
|
34
|
+
|
35
|
+
attr_accessor :published
|
36
|
+
attr_accessor :updated
|
37
|
+
attr_accessor :built
|
38
|
+
|
39
|
+
attr_accessor :generator
|
40
|
+
attr_accessor :generator_version # e.g. @version (atom)
|
41
|
+
attr_accessor :generator_uri # e.g. @uri (atom) - use alias url/link ???
|
42
|
+
end
|
43
|
+
~~~
|
44
|
+
|
45
|
+
|
46
|
+
### `Item` Struct
|
47
|
+
|
48
|
+
~~~
|
49
|
+
class Item
|
50
|
+
attr_accessor :title
|
51
|
+
attr_accessor :title_type # optional for now (text|html|html-escaped) - not yet set
|
52
|
+
attr_accessor :url # todo: rename to link (use alias) ??
|
53
|
+
|
54
|
+
attr_accessor :content
|
55
|
+
attr_accessor :content_type # optional for now (text|html|html-escaped|binary-base64) - not yet set
|
56
|
+
|
57
|
+
attr_accessor :summary
|
58
|
+
attr_accessor :summary_type # optional for now (text|html|html-escaped) - not yet set
|
59
|
+
|
60
|
+
attr_accessor :published
|
61
|
+
attr_accessor :updated
|
62
|
+
|
63
|
+
attr_accessor :guid # todo: rename to id (use alias) ??
|
64
|
+
end
|
65
|
+
~~~
|
66
|
+
|
67
|
+
|
68
|
+
### Read Feed Example
|
69
|
+
|
70
|
+
~~~
|
71
|
+
require 'open-uri'
|
72
|
+
require 'feedparser'
|
73
|
+
|
74
|
+
xml = open( 'http://openfootball.github.io/atom.xml' ).read
|
75
|
+
|
76
|
+
feed = FeedParser::Parser.parse( xml )
|
77
|
+
pp feed
|
78
|
+
~~~
|
79
|
+
|
80
|
+
|
81
|
+
|
82
|
+
## Alternatives
|
83
|
+
|
84
|
+
- [`syndication`](http://syndication.rubyforge.org) [(Source)](https://github.com/lpar/syndication) - by Mathew (aka lpar); RSS 1.0, 2.0, Atom, and understands namespaces; optional support for Dublin Core, iTunes/podcast feeds, and RSS 1.0 Syndication and Content modules
|
85
|
+
- [`simple-rss`](http://rubyforge.org/projects/simple-rss)
|
86
|
+
- [`feedtools`](http://rubyforge.org/projects/feedtools)
|
87
|
+
|
88
|
+
TBD
|
89
|
+
|
90
|
+
|
91
|
+
## License
|
92
|
+
|
93
|
+
The `feedparser` scripts are dedicated to the public domain.
|
94
|
+
Use it as you please with no restrictions whatsoever.
|
95
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'hoe'
|
2
|
+
require './lib/feedparser/version.rb'
|
3
|
+
|
4
|
+
Hoe.spec 'feedparser' do
|
5
|
+
|
6
|
+
self.version = FeedParser::VERSION
|
7
|
+
|
8
|
+
self.summary = 'feedparser - web feed parser and normalizer (RSS 2.0, Atom, etc.)'
|
9
|
+
self.description = summary
|
10
|
+
|
11
|
+
self.urls = ['https://github.com/feedreader/feed.parser']
|
12
|
+
|
13
|
+
self.author = 'Gerald Bauer'
|
14
|
+
self.email = 'feedreader@googlegroups.com'
|
15
|
+
|
16
|
+
# switch extension to .markdown for github formatting
|
17
|
+
self.readme_file = 'README.md'
|
18
|
+
self.history_file = 'HISTORY.md'
|
19
|
+
|
20
|
+
self.extra_deps = [
|
21
|
+
['logutils', '>= 0.6.1']
|
22
|
+
]
|
23
|
+
|
24
|
+
### todo: add fetcher dep for testing (e.g. development only)
|
25
|
+
|
26
|
+
self.licenses = ['Public Domain']
|
27
|
+
|
28
|
+
self.spec_extras = {
|
29
|
+
required_ruby_version: '>= 1.9.2'
|
30
|
+
}
|
31
|
+
|
32
|
+
end
|
data/lib/feedparser.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# core and stdlibs
|
2
|
+
|
3
|
+
require 'rss'
|
4
|
+
require 'pp'
|
5
|
+
require 'date'
|
6
|
+
|
7
|
+
# 3rd party gems/libs
|
8
|
+
|
9
|
+
require 'logutils'
|
10
|
+
|
11
|
+
# our own code
|
12
|
+
|
13
|
+
require 'feedparser/version' # let it always go first
|
14
|
+
|
15
|
+
require 'feedparser/builder/atom'
|
16
|
+
require 'feedparser/builder/rss'
|
17
|
+
|
18
|
+
require 'feedparser/helper/atom_v03'
|
19
|
+
|
20
|
+
require 'feedparser/feed'
|
21
|
+
require 'feedparser/item'
|
22
|
+
require 'feedparser/parser'
|
23
|
+
|
24
|
+
|
25
|
+
|
26
|
+
# say hello
|
27
|
+
puts FeedParser.banner if $DEBUG || (defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG)
|
@@ -0,0 +1,141 @@
|
|
1
|
+
|
2
|
+
module FeedParser
|
3
|
+
|
4
|
+
class AtomFeedBuilder
|
5
|
+
|
6
|
+
include LogUtils::Logging
|
7
|
+
|
8
|
+
def initialize( atom_feed )
|
9
|
+
@feed = build_feed( atom_feed )
|
10
|
+
end
|
11
|
+
|
12
|
+
def to_feed
|
13
|
+
@feed
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.build( atom_feed )
|
17
|
+
feed = self.new( atom_feed )
|
18
|
+
feed.to_feed
|
19
|
+
end
|
20
|
+
|
21
|
+
|
22
|
+
def build_feed( atom_feed )
|
23
|
+
feed = Feed.new
|
24
|
+
## feed.object = atom_feed # not used for now
|
25
|
+
feed.format = 'atom'
|
26
|
+
|
27
|
+
feed.title = atom_feed.title.content
|
28
|
+
logger.debug " atom | title.content >#{atom_feed.title.content}< : #{atom_feed.title.content.class.name}"
|
29
|
+
|
30
|
+
|
31
|
+
logger.debug " atom | id.content >#{atom_feed.id.content}< : #{atom_feed.id.content.class.name}"
|
32
|
+
|
33
|
+
feed.url = nil
|
34
|
+
|
35
|
+
## note: use links (plural to allow multiple links e.g. self,alternate,etc.)
|
36
|
+
atom_feed.links.each_with_index do |link,i|
|
37
|
+
logger.debug " atom | link[#{i+1}] link rel=>#{link.rel}< : #{link.rel.class.name} type=#{link.type} href=#{link.href}"
|
38
|
+
|
39
|
+
## for now assume alternate is link or no rel specified (assumes alternate)
|
40
|
+
## note: only set if feed.url is NOT already set (via <id> for example)
|
41
|
+
if feed.url.nil? && (link.rel == 'alternate' || link.rel.nil?)
|
42
|
+
feed.url = link.href
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
## note: as fallback try id if still no url found
|
47
|
+
## use url only if starts_with http
|
48
|
+
## might not be a link, e.g. blogger uses tag URIs for ids =>
|
49
|
+
## <id>tag:blogger.com,1999:blog-4704664917418794835</id>
|
50
|
+
##
|
51
|
+
## note: id might actually be link to feed NOT to site (remove fallback - why - why not???)
|
52
|
+
##
|
53
|
+
## Note: remove (strip) leading and trailing spaces and newlines
|
54
|
+
|
55
|
+
if feed.url.nil? && atom_feed.id.content.strip.start_with?( 'http' )
|
56
|
+
feed.url = atom_feed.id.content.strip
|
57
|
+
end
|
58
|
+
|
59
|
+
|
60
|
+
if atom_feed.updated
|
61
|
+
# NOTE: empty updated.content e.g. used by google groups feed
|
62
|
+
# will return nil : NilClass
|
63
|
+
|
64
|
+
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
65
|
+
|
66
|
+
feed.updated = atom_feed.updated.content.nil? ? nil : atom_feed.updated.content.to_datetime # .utc.strftime( "%Y-%m-%d %H:%M" )
|
67
|
+
logger.debug " atom | updated.content >#{atom_feed.updated.content}< : #{atom_feed.updated.content.class.name}"
|
68
|
+
end
|
69
|
+
|
70
|
+
if atom_feed.generator
|
71
|
+
## Note: remove (strip) leading and trailing spaces and newlines
|
72
|
+
feed.generator = atom_feed.generator.content.strip
|
73
|
+
logger.debug " atom | generator.content >#{atom_feed.generator.content}< : #{atom_feed.generator.content.class.name}"
|
74
|
+
|
75
|
+
# pp atom_feed.generator
|
76
|
+
feed.generator_version = atom_feed.generator.version
|
77
|
+
feed.generator_uri = atom_feed.generator.uri
|
78
|
+
logger.debug " atom | generator.version >#{atom_feed.generator.version}< : #{atom_feed.generator.version.class.name}"
|
79
|
+
logger.debug " atom | generator.uri >#{atom_feed.generator.uri}< : #{atom_feed.generator.uri.class.name}"
|
80
|
+
end
|
81
|
+
|
82
|
+
if atom_feed.subtitle
|
83
|
+
feed.title2 = atom_feed.subtitle.content
|
84
|
+
logger.debug " atom | subtitle.content >#{atom_feed.subtitle.content}< : #{atom_feed.subtitle.content.class.name}"
|
85
|
+
end
|
86
|
+
|
87
|
+
|
88
|
+
items = []
|
89
|
+
atom_feed.items.each do |atom_item|
|
90
|
+
items << build_feed_item( atom_item )
|
91
|
+
end
|
92
|
+
feed.items = items
|
93
|
+
|
94
|
+
feed # return new feed
|
95
|
+
end # method build_feed_from_atom
|
96
|
+
|
97
|
+
def build_feed_item( atom_item )
|
98
|
+
item = Item.new # Item.new
|
99
|
+
## item.object = atom_item # not used for now
|
100
|
+
|
101
|
+
item.title = atom_item.title.content
|
102
|
+
item.url = atom_item.link.href
|
103
|
+
|
104
|
+
logger.debug " atom | item.title.content: >#{atom_item.title.content}< : #{atom_item.title.content.class.name}"
|
105
|
+
logger.debug " atom | item.link.href: >#{atom_item.link.href}< : #{atom_item.link.href.class.name}"
|
106
|
+
|
107
|
+
|
108
|
+
if atom_item.updated
|
109
|
+
## change time to utc if present? why? why not?
|
110
|
+
# -- .utc.strftime( "%Y-%m-%d %H:%M" )
|
111
|
+
|
112
|
+
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
113
|
+
|
114
|
+
item.updated = atom_item.updated.content.nil? ? nil : atom_item.updated.content.to_datetime
|
115
|
+
logger.debug " atom | item.updated.content >#{atom_item.updated.content}< : #{atom_item.updated.content.class.name}"
|
116
|
+
end
|
117
|
+
|
118
|
+
if atom_item.published
|
119
|
+
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
120
|
+
|
121
|
+
item.published = atom_item.published.content.nil? ? nil : atom_item.published.content.to_datetime
|
122
|
+
logger.debug " atom | item.published.content >#{atom_item.published.content}< : #{atom_item.published.content.class.name}"
|
123
|
+
end
|
124
|
+
|
125
|
+
|
126
|
+
item.guid = atom_item.id.content
|
127
|
+
logger.debug " atom | item.id.content: >#{atom_item.id.content}< : #{atom_item.id.content.class.name}"
|
128
|
+
|
129
|
+
if atom_item.content
|
130
|
+
item.content = atom_item.content.content
|
131
|
+
end
|
132
|
+
|
133
|
+
if atom_item.summary
|
134
|
+
item.summary = atom_item.summary.content
|
135
|
+
end
|
136
|
+
|
137
|
+
item
|
138
|
+
end # method build_feed_item
|
139
|
+
|
140
|
+
end # AtomFeedBuilder
|
141
|
+
end # FeedParser
|
@@ -0,0 +1,129 @@
|
|
1
|
+
|
2
|
+
module FeedParser
|
3
|
+
|
4
|
+
### todo/fix:
|
5
|
+
# rename to Rss20FeedBuilder?? or FeedBuilderRss20 ??
|
6
|
+
|
7
|
+
class RssFeedBuilder
|
8
|
+
|
9
|
+
include LogUtils::Logging
|
10
|
+
|
11
|
+
def initialize( rss_feed )
|
12
|
+
@feed = build_feed( rss_feed )
|
13
|
+
end
|
14
|
+
|
15
|
+
def to_feed
|
16
|
+
@feed
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.build( rss_feed )
|
20
|
+
feed = self.new( rss_feed )
|
21
|
+
feed.to_feed
|
22
|
+
end
|
23
|
+
|
24
|
+
|
25
|
+
def build_feed( rss_feed )
|
26
|
+
feed = Feed.new
|
27
|
+
## feed.object = rss_feed # not used for now
|
28
|
+
feed.format = "rss #{rss_feed.rss_version}"
|
29
|
+
|
30
|
+
feed.title = rss_feed.channel.title # required
|
31
|
+
feed.url = rss_feed.channel.link # required
|
32
|
+
feed.summary = rss_feed.channel.description # required
|
33
|
+
|
34
|
+
logger.debug " rss | channel.description: >#{rss_feed.channel.description}< : #{rss_feed.channel.description.class.name}"
|
35
|
+
|
36
|
+
# NOTE:
|
37
|
+
# All date-times in RSS conform
|
38
|
+
# to the Date and Time Specification of RFC 822
|
39
|
+
# e.g. Sun, 19 May 2012 15:21:36 GMT or
|
40
|
+
# Sat, 07 Sep 2013 00:00:01 GMT
|
41
|
+
|
42
|
+
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
43
|
+
|
44
|
+
feed.built = rss_feed.channel.lastBuildDate.nil? ? nil : rss_feed.channel.lastBuildDate.to_datetime # optional
|
45
|
+
feed.published = rss_feed.channel.pubDate.nil? ? nil : rss_feed.channel.pubDate.to_datetime # optional
|
46
|
+
|
47
|
+
logger.debug " rss | channel.lastBuildDate: >#{rss_feed.channel.lastBuildDate}< : #{rss_feed.channel.lastBuildDate.class.name}"
|
48
|
+
logger.debug " rss | channel.pubDate: >#{rss_feed.channel.pubDate}< : #{rss_feed.channel.pubDate.class.name}"
|
49
|
+
|
50
|
+
|
51
|
+
feed.generator = rss_feed.channel.generator # optional
|
52
|
+
|
53
|
+
logger.debug " rss | channel.generator: >#{rss_feed.channel.generator}< : #{rss_feed.channel.generator.class.name}"
|
54
|
+
|
55
|
+
|
56
|
+
items = []
|
57
|
+
rss_feed.items.each do |rss_item|
|
58
|
+
items << build_feed_item( rss_item )
|
59
|
+
end
|
60
|
+
feed.items = items
|
61
|
+
|
62
|
+
feed # return new feed
|
63
|
+
end
|
64
|
+
|
65
|
+
def build_feed_item( rss_item )
|
66
|
+
|
67
|
+
item = Item.new
|
68
|
+
## item.object = rss_item # not used for now
|
69
|
+
|
70
|
+
item.title = rss_item.title
|
71
|
+
item.url = rss_item.link
|
72
|
+
|
73
|
+
logger.debug " rss | item.title: >#{rss_item.title}< : #{rss_item.title.class.name}"
|
74
|
+
logger.debug " rss | item.link: >#{rss_item.link}< : #{rss_item.link.class.name}"
|
75
|
+
|
76
|
+
## todo:
|
77
|
+
## check if feedburner:origLink present - if yes, use it for url/link
|
78
|
+
## example: use
|
79
|
+
## - <feedburner:origLink>http://www.rubyflow.com/items/9803-gotta-ruby-s-syntax</feedburner:origLink></item>
|
80
|
+
## instead of
|
81
|
+
## - <link>http://feedproxy.google.com/~r/Rubyflow/~3/Ym9Sltg_2_c/9803-gotta-ruby-s-syntax</link>
|
82
|
+
|
83
|
+
|
84
|
+
item.summary = rss_item.description
|
85
|
+
|
86
|
+
# check for <content:encoded>
|
87
|
+
# -- using RSS 1.0 content module in RSS 2.0
|
88
|
+
item.content = rss_item.content_encoded
|
89
|
+
logger.debug " rss | item.content_encoded[0..40]: >#{rss_item.content_encoded ? rss_item.content_encoded[0..40] : ''}< : #{rss_item.content_encoded.class.name}"
|
90
|
+
|
91
|
+
# NOTE:
|
92
|
+
# All date-times in RSS conform
|
93
|
+
# to the Date and Time Specification of RFC 822
|
94
|
+
# e.g. Sun, 19 May 2012 15:21:36 GMT or
|
95
|
+
# Sat, 07 Sep 2013 00:00:01 GMT
|
96
|
+
|
97
|
+
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
98
|
+
|
99
|
+
item.published = rss_item.pubDate.nil? ? nil : rss_item.pubDate.to_datetime # .utc.strftime( "%Y-%m-%d %H:%M" )
|
100
|
+
|
101
|
+
logger.debug " rss | item.pubDate: >#{rss_item.pubDate}< : #{rss_item.pubDate.class.name}"
|
102
|
+
|
103
|
+
|
104
|
+
## fix/todo: check if rss_item.guid present? !!!!
|
105
|
+
##
|
106
|
+
## might be the case e.g. check lambda-the-ultimate.org, for example
|
107
|
+
|
108
|
+
if rss_item.guid && rss_item.guid.content
|
109
|
+
item.guid = rss_item.guid.content
|
110
|
+
logger.debug " rss | item.guid.content: >#{rss_item.guid.content}< : #{rss_item.guid.content.class.name}"
|
111
|
+
else
|
112
|
+
item.guid = rss_item.link
|
113
|
+
logger.warn " rss | item.guid.content missing !!!! - using link for guid"
|
114
|
+
end
|
115
|
+
|
116
|
+
### todo: add support for authors (incl. dc:creator)
|
117
|
+
## <dc:creator>Dhaivat Pandya</dc:creator>
|
118
|
+
|
119
|
+
# todo: categories
|
120
|
+
# <category><![CDATA[Gems]]></category>
|
121
|
+
# <category><![CDATA[Ruby]]></category>
|
122
|
+
# <category><![CDATA[Ruby on Rails]]></category>
|
123
|
+
|
124
|
+
|
125
|
+
item
|
126
|
+
end # method build_feed_item_from_rss
|
127
|
+
|
128
|
+
end # class RssFeedBuilder
|
129
|
+
end # module FeedParser
|