feedparser 0.2.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Manifest.txt +9 -2
- data/README.md +69 -27
- data/Rakefile +2 -1
- data/lib/feedparser.rb +7 -1
- data/lib/feedparser/builder/atom.rb +92 -42
- data/lib/feedparser/builder/rss.rb +70 -42
- data/lib/feedparser/feed.rb +7 -14
- data/lib/feedparser/item.rb +9 -12
- data/lib/feedparser/parser.rb +3 -2
- data/lib/feedparser/version.rb +3 -3
- data/test/feeds/googlegroups.atom +19 -0
- data/test/feeds/googlegroups2.atom +7 -0
- data/test/feeds/headius.atom +122 -0
- data/test/feeds/lambdatheultimate.rss2 +288 -0
- data/test/feeds/railstutorial.atom +655 -0
- data/test/feeds/rubyflow.rss2 +116 -0
- data/test/feeds/rubymine.rss2 +315 -0
- data/test/feeds/rubyonrails.atom +1241 -0
- data/test/feeds/sitepoint.rss2 +219 -0
- data/test/helper.rb +36 -9
- data/test/test_atom.rb +14 -32
- data/test/test_atom_live.rb +45 -0
- data/test/test_rss.rb +13 -25
- data/test/test_rss_live.rb +38 -0
- metadata +27 -5
- data/test/feeds/quirksblog.atom.v03 +0 -1098
- data/test/test_atom_from_file.rb +0 -28
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8986d2c787017a536660de47dc443469b6eb2a0a
|
4
|
+
data.tar.gz: 94441f640c433a63de4f94ffcfbd993af0e4139b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0643b97dc231542d7e0c426191afa625e1114b60c2828abd9615f339509f63d36f6c44fa463877c3932dd9ef6739295eaba24af98d2c942f382398c7e91352d0
|
7
|
+
data.tar.gz: fc838c9f2e4875ff8def0c2855b69bd1786d2391ce9298480cf9b6782915fd88c91c22f961c9f59c17b5e16fcb8b3eb14a0929263c4e2b98fe86cd1d7a26a0ab
|
data/Manifest.txt
CHANGED
@@ -11,8 +11,15 @@ lib/feedparser/parser.rb
|
|
11
11
|
lib/feedparser/version.rb
|
12
12
|
test/feeds/googlegroups.atom
|
13
13
|
test/feeds/googlegroups2.atom
|
14
|
-
test/feeds/
|
14
|
+
test/feeds/headius.atom
|
15
|
+
test/feeds/lambdatheultimate.rss2
|
16
|
+
test/feeds/railstutorial.atom
|
17
|
+
test/feeds/rubyflow.rss2
|
18
|
+
test/feeds/rubymine.rss2
|
19
|
+
test/feeds/rubyonrails.atom
|
20
|
+
test/feeds/sitepoint.rss2
|
15
21
|
test/helper.rb
|
16
22
|
test/test_atom.rb
|
17
|
-
test/
|
23
|
+
test/test_atom_live.rb
|
18
24
|
test/test_rss.rb
|
25
|
+
test/test_rss_live.rb
|
data/README.md
CHANGED
@@ -17,24 +17,49 @@ Feed • Item
|
|
17
17
|
|
18
18
|
### `Feed` Struct
|
19
19
|
|
20
|
+
#### Mappings
|
21
|
+
|
22
|
+
Note: uses question mark (`?`) for optional elements (otherwise assume required elements)
|
23
|
+
|
24
|
+
**Title 'n' Summary**
|
25
|
+
|
26
|
+
Note: The Feed parser will remove all html tags and attributes from the title (RSS 2.0+Atom),
|
27
|
+
description (RSS 2.0) and subtitle (Atom) content and will unescape HTML entities e.g. `&amp;` becomes `&` and so on - always
|
28
|
+
resulting in plain vanilla text.
|
29
|
+
|
30
|
+
| Feed Struct | RSS 2.0 | Notes | Atom | Notes |
|
31
|
+
| ------------------ | ----------------- | ------------------- | ------------- | ------------------- |
|
32
|
+
| `feed.title` | `title` | plain vanilla text | `title` | plain vanilla text |
|
33
|
+
| `feed.summary` | `description` | plain vanilla text | `subtitle`? | plain vanilla text |
|
34
|
+
|
35
|
+
|
36
|
+
**Dates**
|
37
|
+
|
38
|
+
| Feed Struct | RSS 2.0 | Notes | Atom | Notes |
|
39
|
+
| ------------------ | ------------------- | ----------------- | ---------- | --------------- |
|
40
|
+
| `feed.updated` | `lastBuildDate`? | RFC-822 format | `updated` | ISO 8601 format |
|
41
|
+
| `feed.published` | `pubDate`? | RFC-822 format | - | |
|
42
|
+
|
43
|
+
Note: Check - for RSS 2.0 set feed.updated to pubDate or lastBuildDate if only one present? if both present - map as above.
|
44
|
+
|
45
|
+
|
46
|
+
RFC-822 date format e.g. Wed, 14 Jan 2015 19:48:57 +0100
|
47
|
+
|
48
|
+
ISO-8601 date format e.g. 2015-01-11T09:30:16Z
|
49
|
+
|
50
|
+
|
20
51
|
~~~
|
21
52
|
class Feed
|
22
53
|
attr_accessor :format # e.g. atom|rss 2.0|etc.
|
23
|
-
attr_accessor :title
|
24
|
-
attr_accessor :title_type # e.g. text|html|html-escaped (optional) -use - why?? why not??
|
54
|
+
attr_accessor :title # note: always plain vanilla text - if present html tags will get stripped and html entities unescaped
|
25
55
|
attr_accessor :url
|
26
56
|
|
27
57
|
attr_accessor :items
|
28
58
|
|
29
|
-
attr_accessor :summary
|
30
|
-
attr_accessor :summary_type # e.g. text|html|html-escaped
|
31
|
-
|
32
|
-
attr_accessor :title2 # e.g. subtitle (atom)
|
33
|
-
attr_accessor :title2_type # e.g. text|html|html-escaped
|
59
|
+
attr_accessor :summary # note: is description in RSS 2.0 and subtitle in Atom; always plain vanilla text
|
34
60
|
|
35
|
-
attr_accessor :
|
36
|
-
attr_accessor :
|
37
|
-
attr_accessor :built
|
61
|
+
attr_accessor :updated # note: is lastBuildDate in RSS 2.0
|
62
|
+
attr_accessor :published # note: is pubDate in RSS 2.0; not available in Atom
|
38
63
|
|
39
64
|
attr_accessor :generator
|
40
65
|
attr_accessor :generator_version # e.g. @version (atom)
|
@@ -45,20 +70,48 @@ end
|
|
45
70
|
|
46
71
|
### `Item` Struct
|
47
72
|
|
73
|
+
**Title 'n' Summary**
|
74
|
+
|
75
|
+
Note: The Feed parser will remove all html tags and attributes from the title (RSS 2.0+Atom),
|
76
|
+
description (RSS 2.0) and summary (Atom) content
|
77
|
+
and will unescape HTML entities e.g. `&amp;` becomes `&` and so on - always
|
78
|
+
resulting in plain vanilla text.
|
79
|
+
|
80
|
+
Note: In plain vanilla RSS 2.0 there's no difference between (full) content and summary - everything is wrapped
|
81
|
+
in a description element; however, best practice is using the content "module" from RSS 1.0 inside RSS 2.0.
|
82
|
+
If there's no content module present the feed parser will "clone" the description and use one version for `item.summary` and
|
83
|
+
the clone for `item.content`.
|
84
|
+
|
85
|
+
Note: The content element will assume html content.
|
86
|
+
|
87
|
+
| Item Struct | RSS 2.0 | Notes | Atom | Notes |
|
88
|
+
| ------------------ | ----------------- | ------------------- | ------------- | ------------------- |
|
89
|
+
| `item.title` | `title` | plain vanilla text | `title` | plain vanilla text |
|
90
|
+
| `item.summary` | `description` | plain vanilla text | `summary`? | plain vanilla text |
|
91
|
+
| `item.content` | `content`? | html | `content`? | html |
|
92
|
+
|
93
|
+
|
94
|
+
**Dates**
|
95
|
+
|
96
|
+
| Item Struct | RSS 2.0 | Notes | Atom | Notes |
|
97
|
+
| ------------------ | ------------------- | ----------------- | ------------- | --------------- |
|
98
|
+
| `item.updated` | `pubDate`? | RFC-822 format | `updated` | ISO 8601 format |
|
99
|
+
| `item.published` | - | RFC-822 format | `published`? | ISO 8601 format |
|
100
|
+
|
101
|
+
Note: In plain vanilla RSS 2.0 there's only one `pubDate` for items, thus, it's not possible to differentiate between published and updated dates for items; note - the `item.pubDate` will get mapped to `item.updated`. To set the published date in RSS 2.0 use the Dublin Core module, e.g. `dc:created`.
|
102
|
+
|
48
103
|
~~~
|
49
104
|
class Item
|
50
|
-
attr_accessor :title
|
51
|
-
attr_accessor :
|
52
|
-
attr_accessor :url # todo: rename to link (use alias) ??
|
105
|
+
attr_accessor :title # note: always plain vanilla text - if present html tags will get stripped and html entities unescaped
|
106
|
+
attr_accessor :url
|
53
107
|
|
54
108
|
attr_accessor :content
|
55
109
|
attr_accessor :content_type # optional for now (text|html|html-escaped|binary-base64) - not yet set
|
56
110
|
|
57
111
|
attr_accessor :summary
|
58
|
-
attr_accessor :summary_type # optional for now (text|html|html-escaped) - not yet set
|
59
112
|
|
60
|
-
attr_accessor :
|
61
|
-
attr_accessor :
|
113
|
+
attr_accessor :updated # note: is pubDate in RSS 2.0 and updated in Atom
|
114
|
+
attr_accessor :published # note: is published in Atom; not available in RSS 2.0 (use dc:created ??)
|
62
115
|
|
63
116
|
attr_accessor :guid # todo: rename to id (use alias) ??
|
64
117
|
end
|
@@ -78,17 +131,6 @@ pp feed
|
|
78
131
|
~~~
|
79
132
|
|
80
133
|
|
81
|
-
|
82
|
-
## Alternatives
|
83
|
-
|
84
|
-
- [`syndication`](http://syndication.rubyforge.org) [(Source)](https://github.com/lpar/syndication) - by Mathew (aka lpar); RSS 1.0, 2.0, Atom, and understands namespaces; optional support for Dublin Core, iTunes/podcast feeds, and RSS 1.0 Syndication and Content modules
|
85
|
-
- [`simple-rss`](http://rubyforge.org/projects/simple-rss)
|
86
|
-
- [`feedtools`](http://rubyforge.org/projects/feedtools)
|
87
|
-
|
88
|
-
TBD
|
89
|
-
|
90
|
-
|
91
|
-
|
92
134
|
## Install
|
93
135
|
|
94
136
|
Just install the gem:
|
data/Rakefile
CHANGED
data/lib/feedparser.rb
CHANGED
@@ -1,12 +1,18 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
1
4
|
# core and stdlibs
|
2
5
|
|
3
6
|
require 'rss'
|
4
7
|
require 'pp'
|
5
|
-
require '
|
8
|
+
require 'time' # note: ruby has a builtin core time class and a stdlib time class pack; require stdlib extensions
|
9
|
+
require 'date' # note: ruby has a builtin core date class and a stdlib date class pack; require stdlib extensions
|
6
10
|
|
7
11
|
# 3rd party gems/libs
|
8
12
|
|
9
13
|
require 'logutils'
|
14
|
+
require 'textutils'
|
15
|
+
|
10
16
|
|
11
17
|
# our own code
|
12
18
|
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
|
2
3
|
module FeedParser
|
3
4
|
|
@@ -5,6 +6,12 @@ class AtomFeedBuilder
|
|
5
6
|
|
6
7
|
include LogUtils::Logging
|
7
8
|
|
9
|
+
|
10
|
+
def self.build( atom_feed )
|
11
|
+
feed = self.new( atom_feed )
|
12
|
+
feed.to_feed
|
13
|
+
end
|
14
|
+
|
8
15
|
def initialize( atom_feed )
|
9
16
|
@feed = build_feed( atom_feed )
|
10
17
|
end
|
@@ -13,28 +20,21 @@ class AtomFeedBuilder
|
|
13
20
|
@feed
|
14
21
|
end
|
15
22
|
|
16
|
-
def self.build( atom_feed )
|
17
|
-
feed = self.new( atom_feed )
|
18
|
-
feed.to_feed
|
19
|
-
end
|
20
23
|
|
21
24
|
|
22
25
|
def build_feed( atom_feed )
|
23
26
|
feed = Feed.new
|
24
|
-
## feed.object = atom_feed # not use for now
|
25
27
|
feed.format = 'atom'
|
26
28
|
|
27
|
-
feed.title = atom_feed.title.
|
28
|
-
logger.debug " atom | title.content >#{atom_feed.title.content}< : #{atom_feed.title.content.class.name}"
|
29
|
-
|
29
|
+
feed.title = handle_content( atom_feed.title, 'feed.title' )
|
30
30
|
|
31
|
-
logger.debug " atom | id.content >#{atom_feed.id.content}< : #{atom_feed.id.content.class.name}"
|
31
|
+
logger.debug " atom | feed.id.content >#{atom_feed.id.content}< : #{atom_feed.id.content.class.name}"
|
32
32
|
|
33
33
|
feed.url = nil
|
34
34
|
|
35
35
|
## note: use links (plural to allow multiple links e.g. self,alternate,etc.)
|
36
36
|
atom_feed.links.each_with_index do |link,i|
|
37
|
-
logger.debug " atom | link[#{i+1}]
|
37
|
+
logger.debug " atom | feed.link[#{i+1}] rel=>#{link.rel}< : #{link.rel.class.name} type=>#{link.type}< href=>#{link.href}<"
|
38
38
|
|
39
39
|
## for now assume alternate is link or no rel specified (assumes alternate)
|
40
40
|
## note: only set if feed.url is NOT already set (via <id> for example)
|
@@ -43,7 +43,11 @@ class AtomFeedBuilder
|
|
43
43
|
end
|
44
44
|
end
|
45
45
|
|
46
|
-
|
46
|
+
if feed.url.nil?
|
47
|
+
### todo/fix: issue warning - no link found!!!!
|
48
|
+
end
|
49
|
+
|
50
|
+
## note: as fallback try id if still no url found - why?? why not??
|
47
51
|
## use url only if starts_with http
|
48
52
|
## might not be link e.g blogger uses for ids =>
|
49
53
|
## <id>tag:blogger.com,1999:blog-4704664917418794835</id>
|
@@ -58,30 +62,23 @@ class AtomFeedBuilder
|
|
58
62
|
|
59
63
|
|
60
64
|
if atom_feed.updated
|
61
|
-
|
62
|
-
# will return nil : NilClass
|
63
|
-
|
64
|
-
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
65
|
-
|
66
|
-
feed.updated = atom_feed.updated.content.nil? ? nil : atom_feed.updated.content.to_datetime # .utc.strftime( "%Y-%m-%d %H:%M" )
|
67
|
-
logger.debug " atom | updated.content >#{atom_feed.updated.content}< : #{atom_feed.updated.content.class.name}"
|
65
|
+
feed.updated = handle_date( atom_feed.updated, 'feed.updated' )
|
68
66
|
end
|
69
67
|
|
70
68
|
if atom_feed.generator
|
71
69
|
## Note: remove (strip) leading and trailing spaces and newlines
|
72
70
|
feed.generator = atom_feed.generator.content.strip
|
73
|
-
logger.debug " atom | generator.content >#{atom_feed.generator.content}< : #{atom_feed.generator.content.class.name}"
|
71
|
+
logger.debug " atom | feed.generator.content >#{atom_feed.generator.content}< : #{atom_feed.generator.content.class.name}"
|
74
72
|
|
75
73
|
# pp atom_feed.generator
|
76
74
|
feed.generator_version = atom_feed.generator.version
|
77
75
|
feed.generator_uri = atom_feed.generator.uri
|
78
|
-
logger.debug " atom | generator.version >#{atom_feed.generator.version}< : #{atom_feed.generator.version.class.name}"
|
79
|
-
logger.debug " atom | generator.uri >#{atom_feed.generator.uri}< : #{atom_feed.generator.uri.class.name}"
|
76
|
+
logger.debug " atom | feed.generator.version >#{atom_feed.generator.version}< : #{atom_feed.generator.version.class.name}"
|
77
|
+
logger.debug " atom | feed.generator.uri >#{atom_feed.generator.uri}< : #{atom_feed.generator.uri.class.name}"
|
80
78
|
end
|
81
79
|
|
82
80
|
if atom_feed.subtitle
|
83
|
-
feed.
|
84
|
-
logger.debug " atom | subtitle.content >#{atom_feed.subtitle.content}< : #{atom_feed.subtitle.content.class.name}"
|
81
|
+
feed.summary = handle_content( atom_feed.subtitle, 'feed.subtitle => summary' )
|
85
82
|
end
|
86
83
|
|
87
84
|
|
@@ -94,48 +91,101 @@ class AtomFeedBuilder
|
|
94
91
|
feed # return new feed
|
95
92
|
end # method build_feed_from_atom
|
96
93
|
|
94
|
+
|
97
95
|
def build_feed_item( atom_item )
|
98
96
|
item = Item.new # Item.new
|
99
|
-
## item.object = atom_item # not used for now
|
100
97
|
|
101
|
-
item.title = atom_item.title.
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
98
|
+
item.title = handle_content( atom_item.title, 'item.title' )
|
99
|
+
|
100
|
+
## Note: item might have many links
|
101
|
+
## e.g. see blogger (headius)
|
102
|
+
## <link rel='replies' type='application/atom+xml' href='http://blog.headius.com/feeds/3430080308857860963/comments/default' title='Post Comments'/>
|
103
|
+
## <link rel='replies' type='text/html' href='http://blog.headius.com/2014/05/jrubyconfeu-2014.html#comment-form' title='0 Comments'/>
|
104
|
+
## <link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/4704664917418794835/posts/default/3430080308857860963'/>
|
105
|
+
## <link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/4704664917418794835/posts/default/3430080308857860963'/>
|
106
|
+
## <link rel='alternate' type='text/html' href='http://blog.headius.com/2014/05/jrubyconfeu-2014.html'
|
107
|
+
|
108
|
+
item.url = nil
|
109
|
+
|
110
|
+
if atom_item.links.size == 1
|
111
|
+
item.url = atom_item.link.href
|
112
|
+
logger.debug " atom | item.link.href >#{atom_item.link.href}< : #{atom_item.link.href.class.name}"
|
113
|
+
else
|
114
|
+
## note: use links (plural to allow multiple links e.g. self,alternate,etc.)
|
115
|
+
atom_item.links.each_with_index do |link,i|
|
116
|
+
logger.debug " atom | item.link[#{i+1}] rel=>#{link.rel}< : #{link.rel.class.name} type=>#{link.type}< href=>#{link.href}<"
|
117
|
+
## for now assume alternate is link or no rel specified (assumes alternate)
|
118
|
+
## note: only set if feed.url is NOT already set (via <id> for example)
|
119
|
+
if item.url.nil? && (link.rel == 'alternate' || link.rel.nil?)
|
120
|
+
item.url = link.href
|
121
|
+
end
|
122
|
+
end
|
123
|
+
end
|
106
124
|
|
107
125
|
|
108
126
|
if atom_item.updated
|
109
|
-
|
110
|
-
# -- .utc.strftime( "%Y-%m-%d %H:%M" )
|
111
|
-
|
112
|
-
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
113
|
-
|
114
|
-
item.updated = atom_item.updated.content.nil? ? nil : atom_item.updated.content.to_datetime
|
115
|
-
logger.debug " atom | item.updated.content >#{atom_item.updated.content}< : #{atom_item.updated.content.class.name}"
|
127
|
+
item.updated = handle_date( atom_item.updated, 'item.updated' )
|
116
128
|
end
|
117
129
|
|
118
130
|
if atom_item.published
|
119
|
-
|
120
|
-
|
121
|
-
item.published = atom_item.published.content.nil? ? nil : atom_item.published.content.to_datetime
|
122
|
-
logger.debug " atom | item.published.content >#{atom_item.published.content}< : #{atom_item.published.content.class.name}"
|
131
|
+
item.published = handle_date( atom_item.published, 'item.published' )
|
123
132
|
end
|
124
133
|
|
125
134
|
|
126
135
|
item.guid = atom_item.id.content
|
127
|
-
logger.debug " atom | item.id.content
|
136
|
+
logger.debug " atom | item.id.content >#{atom_item.id.content}< : #{atom_item.id.content.class.name}"
|
128
137
|
|
129
138
|
if atom_item.content
|
130
139
|
item.content = atom_item.content.content
|
131
140
|
end
|
132
141
|
|
133
142
|
if atom_item.summary
|
134
|
-
item.summary = atom_item.summary.
|
143
|
+
item.summary = handle_content( atom_item.summary, 'item.summary' )
|
135
144
|
end
|
136
145
|
|
137
146
|
item
|
138
147
|
end # method build_feed_item
|
139
148
|
|
149
|
+
|
150
|
+
|
151
|
+
def handle_date( el, name )
|
152
|
+
## change time to utc if present? why? why not?
|
153
|
+
# -- .utc.strftime( "%Y-%m-%d %H:%M" )
|
154
|
+
|
155
|
+
###############
|
156
|
+
# examples:
|
157
|
+
# 2015-01-02 01:56:06 +0100
|
158
|
+
|
159
|
+
logger.debug " atom | #{name}.content >#{el.content}< : #{el.content.class.name}"
|
160
|
+
|
161
|
+
# NOTE: empty updated.content possible e.g. used by google groups feed (e.g. <updated></updated>)
|
162
|
+
# will return nil : NilClass
|
163
|
+
|
164
|
+
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
165
|
+
date = if el.content.nil?
|
166
|
+
nil
|
167
|
+
else
|
168
|
+
el.content.to_datetime
|
169
|
+
end
|
170
|
+
|
171
|
+
date
|
172
|
+
end
|
173
|
+
|
174
|
+
|
175
|
+
def handle_content( el, name ) ## rename to handle_plain_vanilla_text_content - why? why not?
|
176
|
+
### todo/fix: if type html ?? strip html tags n attributes
|
177
|
+
## always strip html tags n attributes?? why? why not?
|
178
|
+
|
179
|
+
## check if content.nil? possible e.g. <title></title> => empty string or nil?
|
180
|
+
|
181
|
+
## note: dump head (first 30 chars)
|
182
|
+
logger.debug " atom | #{name}.content[0..30] (type=>#{el.type}<) >#{el.content[0..30]}< : #{el.content.class.name}"
|
183
|
+
|
184
|
+
## note: always strip leading and trailing whitespaces (spaces/tabs/newlines)
|
185
|
+
text = el.content.strip
|
186
|
+
text
|
187
|
+
end
|
188
|
+
|
189
|
+
|
140
190
|
end # AtomFeedBuilder
|
141
191
|
end # FeedParser
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
|
2
3
|
module FeedParser
|
3
4
|
|
@@ -8,6 +9,12 @@ class RssFeedBuilder
|
|
8
9
|
|
9
10
|
include LogUtils::Logging
|
10
11
|
|
12
|
+
|
13
|
+
def self.build( rss_feed )
|
14
|
+
feed = self.new( rss_feed )
|
15
|
+
feed.to_feed
|
16
|
+
end
|
17
|
+
|
11
18
|
def initialize( rss_feed )
|
12
19
|
@feed = build_feed( rss_feed )
|
13
20
|
end
|
@@ -16,41 +23,25 @@ class RssFeedBuilder
|
|
16
23
|
@feed
|
17
24
|
end
|
18
25
|
|
19
|
-
def self.build( rss_feed )
|
20
|
-
feed = self.new( rss_feed )
|
21
|
-
feed.to_feed
|
22
|
-
end
|
23
26
|
|
24
27
|
|
25
28
|
def build_feed( rss_feed )
|
26
29
|
feed = Feed.new
|
27
|
-
## feed.object = rss_feed # not use for now
|
28
30
|
feed.format = "rss #{rss_feed.rss_version}"
|
29
31
|
|
30
|
-
feed.
|
31
|
-
feed.url = rss_feed.channel.link # required
|
32
|
-
feed.summary = rss_feed.channel.description # required
|
33
|
-
|
34
|
-
logger.debug " rss | channel.description: >#{rss_feed.channel.description}< : #{rss_feed.channel.description.class.name}"
|
35
|
-
|
36
|
-
# NOTE:
|
37
|
-
# All date-times in RSS conform
|
38
|
-
# to the Date and Time Specification of RFC 822
|
39
|
-
# e.g. Sun, 19 May 2012 15:21:36 GMT or
|
40
|
-
# Sat, 07 Sep 2013 00:00:01 GMT
|
32
|
+
logger.debug " rss | feed.version >#{rss_feed.rss_version}<"
|
41
33
|
|
42
|
-
|
43
|
-
|
44
|
-
feed.
|
45
|
-
feed.published = rss_feed.channel.pubDate.nil? ? nil : rss_feed.channel.pubDate.to_datetime # optional
|
34
|
+
feed.title = handle_content( rss_feed.channel.title, 'feed.title' ) # required
|
35
|
+
feed.summary = handle_content( rss_feed.channel.description, 'feed.description => summary' ) # required
|
36
|
+
feed.url = rss_feed.channel.link # required
|
46
37
|
|
47
|
-
|
48
|
-
|
38
|
+
feed.updated = handle_date( rss_feed.channel.lastBuildDate, 'feed.lastBuildDate => updated' ) # optional
|
39
|
+
feed.published = handle_date( rss_feed.channel.pubDate, 'feed.pubDate => published' ) # optional
|
49
40
|
|
50
41
|
|
51
|
-
feed.generator = rss_feed.channel.generator
|
42
|
+
feed.generator = rss_feed.channel.generator # optional
|
52
43
|
|
53
|
-
logger.debug " rss |
|
44
|
+
logger.debug " rss | feed.generator >#{rss_feed.channel.generator}< : #{rss_feed.channel.generator.class.name}"
|
54
45
|
|
55
46
|
|
56
47
|
items = []
|
@@ -65,13 +56,12 @@ class RssFeedBuilder
|
|
65
56
|
def build_feed_item( rss_item )
|
66
57
|
|
67
58
|
item = Item.new
|
68
|
-
## item.object = rss_item # not use for now
|
69
59
|
|
70
|
-
item.title = rss_item.title
|
60
|
+
item.title = handle_content( rss_item.title, 'item.title' )
|
71
61
|
item.url = rss_item.link
|
72
62
|
|
73
|
-
logger.debug " rss | item.
|
74
|
-
|
63
|
+
logger.debug " rss | item.link >#{rss_item.link}< : #{rss_item.link.class.name}"
|
64
|
+
|
75
65
|
|
76
66
|
## todo:
|
77
67
|
## check if feedburner:origLink present - if yes, use it for url/link
|
@@ -81,24 +71,15 @@ class RssFeedBuilder
|
|
81
71
|
## - <link>http://feedproxy.google.com/~r/Rubyflow/~3/Ym9Sltg_2_c/9803-gotta-ruby-s-syntax</link>
|
82
72
|
|
83
73
|
|
84
|
-
item.summary = rss_item.description
|
74
|
+
item.summary = handle_content( rss_item.description, 'item.description => summary' )
|
85
75
|
|
86
76
|
# check for <content:encoded>
|
87
77
|
# -- using RSS 1.0 content module in RSS 2.0
|
88
78
|
item.content = rss_item.content_encoded
|
89
|
-
logger.debug " rss | item.content_encoded[0..40]
|
90
|
-
|
91
|
-
# NOTE:
|
92
|
-
# All date-times in RSS conform
|
93
|
-
# to the Date and Time Specification of RFC 822
|
94
|
-
# e.g. Sun, 19 May 2012 15:21:36 GMT or
|
95
|
-
# Sat, 07 Sep 2013 00:00:01 GMT
|
96
|
-
|
97
|
-
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
79
|
+
logger.debug " rss | item.content_encoded[0..40] >#{rss_item.content_encoded ? rss_item.content_encoded[0..40] : ''}< : #{rss_item.content_encoded.class.name}"
|
98
80
|
|
99
|
-
item.published = rss_item.pubDate.nil? ? nil : rss_item.pubDate.to_datetime # .utc.strftime( "%Y-%m-%d %H:%M" )
|
100
81
|
|
101
|
-
|
82
|
+
item.updated = handle_date( rss_item.pubDate, 'item.pubDate => updated' )
|
102
83
|
|
103
84
|
|
104
85
|
## fix/todo: check if rss_item.guid present? !!!!
|
@@ -107,7 +88,7 @@ class RssFeedBuilder
|
|
107
88
|
|
108
89
|
if rss_item.guid && rss_item.guid.content
|
109
90
|
item.guid = rss_item.guid.content
|
110
|
-
logger.debug " rss | item.guid.content
|
91
|
+
logger.debug " rss | item.guid.content >#{rss_item.guid.content}< : #{rss_item.guid.content.class.name}"
|
111
92
|
else
|
112
93
|
item.guid = rss_item.link
|
113
94
|
logger.warn " rss | item.guid.content missing !!!! - using link for guid"
|
@@ -121,9 +102,56 @@ class RssFeedBuilder
|
|
121
102
|
# <category><![CDATA[Ruby]]></category>
|
122
103
|
# <category><![CDATA[Ruby on Rails]]></category>
|
123
104
|
|
124
|
-
|
125
105
|
item
|
126
106
|
end # method build_feed_item_from_rss
|
127
107
|
|
108
|
+
|
109
|
+
|
110
|
+
def handle_date( el, name )
|
111
|
+
## change time to utc if present? why? why not?
|
112
|
+
# -- .utc.strftime( "%Y-%m-%d %H:%M" )
|
113
|
+
|
114
|
+
# NOTE:
|
115
|
+
# All date-times in RSS conform
|
116
|
+
# to the Date and Time Specification of RFC 822
|
117
|
+
# e.g. Sun, 19 May 2012 15:21:36 GMT or
|
118
|
+
# Sat, 07 Sep 2013 00:00:01 GMT
|
119
|
+
|
120
|
+
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
121
|
+
|
122
|
+
logger.debug " rss | #{name} >#{el}< : #{el.class.name}"
|
123
|
+
|
124
|
+
|
125
|
+
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
126
|
+
date = if el.nil?
|
127
|
+
nil
|
128
|
+
else
|
129
|
+
el.to_datetime
|
130
|
+
end
|
131
|
+
|
132
|
+
date
|
133
|
+
end
|
134
|
+
|
135
|
+
def handle_content( el, name )
|
136
|
+
## note:
|
137
|
+
# use for feed.title, feed.description
|
138
|
+
# item.title, item.description
|
139
|
+
#
|
140
|
+
# do NOT use for others e.g. feed.generator, etc.
|
141
|
+
|
142
|
+
|
143
|
+
## todo/fix: strip html tags n attributes ???
|
144
|
+
|
145
|
+
logger.debug " rss | #{name} >#{el}< : #{el.class.name}"
|
146
|
+
|
147
|
+
text = if el.nil?
|
148
|
+
nil
|
149
|
+
else
|
150
|
+
el.strip
|
151
|
+
end
|
152
|
+
text
|
153
|
+
end
|
154
|
+
|
155
|
+
|
128
156
|
end # class RssFeedBuilder
|
129
157
|
end # module FeedParser
|