feedparser 0.2.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Manifest.txt +9 -2
- data/README.md +69 -27
- data/Rakefile +2 -1
- data/lib/feedparser.rb +7 -1
- data/lib/feedparser/builder/atom.rb +92 -42
- data/lib/feedparser/builder/rss.rb +70 -42
- data/lib/feedparser/feed.rb +7 -14
- data/lib/feedparser/item.rb +9 -12
- data/lib/feedparser/parser.rb +3 -2
- data/lib/feedparser/version.rb +3 -3
- data/test/feeds/googlegroups.atom +19 -0
- data/test/feeds/googlegroups2.atom +7 -0
- data/test/feeds/headius.atom +122 -0
- data/test/feeds/lambdatheultimate.rss2 +288 -0
- data/test/feeds/railstutorial.atom +655 -0
- data/test/feeds/rubyflow.rss2 +116 -0
- data/test/feeds/rubymine.rss2 +315 -0
- data/test/feeds/rubyonrails.atom +1241 -0
- data/test/feeds/sitepoint.rss2 +219 -0
- data/test/helper.rb +36 -9
- data/test/test_atom.rb +14 -32
- data/test/test_atom_live.rb +45 -0
- data/test/test_rss.rb +13 -25
- data/test/test_rss_live.rb +38 -0
- metadata +27 -5
- data/test/feeds/quirksblog.atom.v03 +0 -1098
- data/test/test_atom_from_file.rb +0 -28
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8986d2c787017a536660de47dc443469b6eb2a0a
|
4
|
+
data.tar.gz: 94441f640c433a63de4f94ffcfbd993af0e4139b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0643b97dc231542d7e0c426191afa625e1114b60c2828abd9615f339509f63d36f6c44fa463877c3932dd9ef6739295eaba24af98d2c942f382398c7e91352d0
|
7
|
+
data.tar.gz: fc838c9f2e4875ff8def0c2855b69bd1786d2391ce9298480cf9b6782915fd88c91c22f961c9f59c17b5e16fcb8b3eb14a0929263c4e2b98fe86cd1d7a26a0ab
|
data/Manifest.txt
CHANGED
@@ -11,8 +11,15 @@ lib/feedparser/parser.rb
|
|
11
11
|
lib/feedparser/version.rb
|
12
12
|
test/feeds/googlegroups.atom
|
13
13
|
test/feeds/googlegroups2.atom
|
14
|
-
test/feeds/
|
14
|
+
test/feeds/headius.atom
|
15
|
+
test/feeds/lambdatheultimate.rss2
|
16
|
+
test/feeds/railstutorial.atom
|
17
|
+
test/feeds/rubyflow.rss2
|
18
|
+
test/feeds/rubymine.rss2
|
19
|
+
test/feeds/rubyonrails.atom
|
20
|
+
test/feeds/sitepoint.rss2
|
15
21
|
test/helper.rb
|
16
22
|
test/test_atom.rb
|
17
|
-
test/
|
23
|
+
test/test_atom_live.rb
|
18
24
|
test/test_rss.rb
|
25
|
+
test/test_rss_live.rb
|
data/README.md
CHANGED
@@ -17,24 +17,49 @@ Feed • Item
|
|
17
17
|
|
18
18
|
### `Feed` Struct
|
19
19
|
|
20
|
+
#### Mappings
|
21
|
+
|
22
|
+
Note: uses question mark (`?`) for optional elements (otherwise assume required elements)
|
23
|
+
|
24
|
+
**Title 'n' Summary**
|
25
|
+
|
26
|
+
Note: The Feed parser will remove all html tags and attributes from the title (RSS 2.0+Atom),
|
27
|
+
description (RSS 2.0) and subtitle (Atom) content and will unescape HTML entities e.g. `&amp;` becomes & and so on - always
|
28
|
+
resulting in plain vanilla text.
|
29
|
+
|
30
|
+
| Feed Struct | RSS 2.0 | Notes | Atom | Notes |
|
31
|
+
| ------------------ | ----------------- | ------------------- | ------------- | ------------------- |
|
32
|
+
| `feed.title` | `title` | plain vanilla text | `title` | plain vanilla text |
|
33
|
+
| `feed.summary` | `description` | plain vanilla text | `subtitle`? | plain vanilla text |
|
34
|
+
|
35
|
+
|
36
|
+
**Dates**
|
37
|
+
|
38
|
+
| Feed Struct | RSS 2.0 | Notes | Atom | Notes |
|
39
|
+
| ------------------ | ------------------- | ----------------- | ---------- | --------------- |
|
40
|
+
| `feed.updated` | `lastBuildDate`? | RFC-822 format | `updated` | ISO 8601 format |
|
41
|
+
| `feed.published` | `pubDate`? | RFC-822 format | - | |
|
42
|
+
|
43
|
+
Note: Check - for RSS 2.0 set feed.updated to pubDate or lastBuildDate if only one present? if both present - map as above.
|
44
|
+
|
45
|
+
|
46
|
+
RFC-822 date format e.g. Wed, 14 Jan 2015 19:48:57 +0100
|
47
|
+
|
48
|
+
ISO-8601 date format e.g. 2015-01-11T09:30:16Z
|
49
|
+
|
50
|
+
|
20
51
|
~~~
|
21
52
|
class Feed
|
22
53
|
attr_accessor :format # e.g. atom|rss 2.0|etc.
|
23
|
-
attr_accessor :title
|
24
|
-
attr_accessor :title_type # e.g. text|html|html-escaped (optional) -use - why?? why not??
|
54
|
+
attr_accessor :title # note: always plain vanilla text - if present html tags will get stripped and html entities unescaped
|
25
55
|
attr_accessor :url
|
26
56
|
|
27
57
|
attr_accessor :items
|
28
58
|
|
29
|
-
attr_accessor :summary
|
30
|
-
attr_accessor :summary_type # e.g. text|html|html-escaped
|
31
|
-
|
32
|
-
attr_accessor :title2 # e.g. subtitle (atom)
|
33
|
-
attr_accessor :title2_type # e.g. text|html|html-escaped
|
59
|
+
attr_accessor :summary # note: is description in RSS 2.0 and subtitle in Atom; always plain vanilla text
|
34
60
|
|
35
|
-
attr_accessor :
|
36
|
-
attr_accessor :
|
37
|
-
attr_accessor :built
|
61
|
+
attr_accessor :updated # note: is lastBuildDate in RSS 2.0
|
62
|
+
attr_accessor :published # note: is pubDate in RSS 2.0; not available in Atom
|
38
63
|
|
39
64
|
attr_accessor :generator
|
40
65
|
attr_accessor :generator_version # e.g. @version (atom)
|
@@ -45,20 +70,48 @@ end
|
|
45
70
|
|
46
71
|
### `Item` Struct
|
47
72
|
|
73
|
+
**Title 'n' Summary**
|
74
|
+
|
75
|
+
Note: The Feed parser will remove all html tags and attributes from the title (RSS 2.0+Atom),
|
76
|
+
description (RSS 2.0) and summary (Atom) content
|
77
|
+
and will unescape HTML entities e.g. `&amp;` becomes & and so on - always
|
78
|
+
resulting in plain vanilla text.
|
79
|
+
|
80
|
+
Note: In plain vanilla RSS 2.0 there's no difference between (full) content and summary - everything is wrapped
|
81
|
+
in a description element; however, best practice is using the content "module" from RSS 1.0 inside RSS 2.0.
|
82
|
+
If there's no content module present the feed parser will "clone" the description and use one version for `item.summary` and
|
83
|
+
the clone for `item.content`.
|
84
|
+
|
85
|
+
Note: The content element will assume html content.
|
86
|
+
|
87
|
+
| Feed Struct | RSS 2.0 | Notes | Atom | Notes |
|
88
|
+
| ------------------ | ----------------- | ------------------- | ------------- | ------------------- |
|
89
|
+
| `item.title` | `title` | plain vanilla text | `title` | plain vanilla text |
|
90
|
+
| `item.summary` | `description` | plain vanilla text | `summary`? | plain vanilla text |
|
91
|
+
| `item.content` | `content`? | html | `content`? | html |
|
92
|
+
|
93
|
+
|
94
|
+
**Dates**
|
95
|
+
|
96
|
+
| Item Struct | RSS 2.0 | Notes | Atom | Notes |
|
97
|
+
| ------------------ | ------------------- | ----------------- | ------------- | --------------- |
|
98
|
+
| `item.updated` | `pubDate`? | RFC-822 format | `updated` | ISO 8601 format |
|
99
|
+
| `item.published` | - | RFC-822 format | `published`? | ISO 8601 format |
|
100
|
+
|
101
|
+
Note: In plain vanilla RSS 2.0 there's only one `pubDate` for items, thus, it's not possible to differentiate between published and updated dates for items; note - the `item.pubDate` will get mapped to `item.updated`. To set the published date in RSS 2.0 use the dublin core module e.g. `dc:created`.
|
102
|
+
|
48
103
|
~~~
|
49
104
|
class Item
|
50
|
-
attr_accessor :title
|
51
|
-
attr_accessor :
|
52
|
-
attr_accessor :url # todo: rename to link (use alias) ??
|
105
|
+
attr_accessor :title # note: always plain vanilla text - if present html tags will get stripped and html entities unescaped
|
106
|
+
attr_accessor :url
|
53
107
|
|
54
108
|
attr_accessor :content
|
55
109
|
attr_accessor :content_type # optional for now (text|html|html-escaped|binary-base64) - not yet set
|
56
110
|
|
57
111
|
attr_accessor :summary
|
58
|
-
attr_accessor :summary_type # optional for now (text|html|html-escaped) - not yet set
|
59
112
|
|
60
|
-
attr_accessor :
|
61
|
-
attr_accessor :
|
113
|
+
attr_accessor :updated # note: is pubDate in RSS 2.0 and updated in Atom
|
114
|
+
attr_accessor :published # note: is published in Atom; not available in RSS 2.0 (use dc:created ??)
|
62
115
|
|
63
116
|
attr_accessor :guid # todo: rename to id (use alias) ??
|
64
117
|
end
|
@@ -78,17 +131,6 @@ pp feed
|
|
78
131
|
~~~
|
79
132
|
|
80
133
|
|
81
|
-
|
82
|
-
## Alternatives
|
83
|
-
|
84
|
-
- [`syndication`](http://syndication.rubyforge.org) [(Source)](https://github.com/lpar/syndication) - by Mathew (aka lpar); RSS 1.0, 2.0, Atom, and understands namespaces; optional support for Dublin Core, iTunes/podcast feeds, and RSS 1.0 Syndication and Content modules
|
85
|
-
- [`simple-rss`](http://rubyforge.org/projects/simple-rss)
|
86
|
-
- [`feedtools`](http://rubyforge.org/projects/feedtools)
|
87
|
-
|
88
|
-
TBD
|
89
|
-
|
90
|
-
|
91
|
-
|
92
134
|
## Install
|
93
135
|
|
94
136
|
Just install the gem:
|
data/Rakefile
CHANGED
data/lib/feedparser.rb
CHANGED
@@ -1,12 +1,18 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
1
4
|
# core and stdlibs
|
2
5
|
|
3
6
|
require 'rss'
|
4
7
|
require 'pp'
|
5
|
-
require '
|
8
|
+
require 'time' # note: ruby has a builtin core time class and a stdlib time class pack; require stdlib extensions
|
9
|
+
require 'date' # note: ruby has a builtin core date class and a stdlib date class pack; require stdlib extensions
|
6
10
|
|
7
11
|
# 3rd party gems/libs
|
8
12
|
|
9
13
|
require 'logutils'
|
14
|
+
require 'textutils'
|
15
|
+
|
10
16
|
|
11
17
|
# our own code
|
12
18
|
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
|
2
3
|
module FeedParser
|
3
4
|
|
@@ -5,6 +6,12 @@ class AtomFeedBuilder
|
|
5
6
|
|
6
7
|
include LogUtils::Logging
|
7
8
|
|
9
|
+
|
10
|
+
def self.build( atom_feed )
|
11
|
+
feed = self.new( atom_feed )
|
12
|
+
feed.to_feed
|
13
|
+
end
|
14
|
+
|
8
15
|
def initialize( atom_feed )
|
9
16
|
@feed = build_feed( atom_feed )
|
10
17
|
end
|
@@ -13,28 +20,21 @@ class AtomFeedBuilder
|
|
13
20
|
@feed
|
14
21
|
end
|
15
22
|
|
16
|
-
def self.build( atom_feed )
|
17
|
-
feed = self.new( atom_feed )
|
18
|
-
feed.to_feed
|
19
|
-
end
|
20
23
|
|
21
24
|
|
22
25
|
def build_feed( atom_feed )
|
23
26
|
feed = Feed.new
|
24
|
-
## feed.object = atom_feed # not use for now
|
25
27
|
feed.format = 'atom'
|
26
28
|
|
27
|
-
feed.title = atom_feed.title.
|
28
|
-
logger.debug " atom | title.content >#{atom_feed.title.content}< : #{atom_feed.title.content.class.name}"
|
29
|
-
|
29
|
+
feed.title = handle_content( atom_feed.title, 'feed.title' )
|
30
30
|
|
31
|
-
logger.debug " atom | id.content >#{atom_feed.id.content}< : #{atom_feed.id.content.class.name}"
|
31
|
+
logger.debug " atom | feed.id.content >#{atom_feed.id.content}< : #{atom_feed.id.content.class.name}"
|
32
32
|
|
33
33
|
feed.url = nil
|
34
34
|
|
35
35
|
## note: use links (plural to allow multiple links e.g. self,alternate,etc.)
|
36
36
|
atom_feed.links.each_with_index do |link,i|
|
37
|
-
logger.debug " atom | link[#{i+1}]
|
37
|
+
logger.debug " atom | feed.link[#{i+1}] rel=>#{link.rel}< : #{link.rel.class.name} type=>#{link.type}< href=>#{link.href}<"
|
38
38
|
|
39
39
|
## for now assume alternate is link or no rel specified (assumes alternate)
|
40
40
|
## note: only set if feed.url is NOT already set (via <id> for example)
|
@@ -43,7 +43,11 @@ class AtomFeedBuilder
|
|
43
43
|
end
|
44
44
|
end
|
45
45
|
|
46
|
-
|
46
|
+
if feed.url.nil?
|
47
|
+
### todo/fix: issue warning - no link found!!!!
|
48
|
+
end
|
49
|
+
|
50
|
+
## note: as fallback try id if still no url found - why?? why not??
|
47
51
|
## use url only if starts_with http
|
48
52
|
## might not be link e.g blogger uses for ids =>
|
49
53
|
## <id>tag:blogger.com,1999:blog-4704664917418794835</id>
|
@@ -58,30 +62,23 @@ class AtomFeedBuilder
|
|
58
62
|
|
59
63
|
|
60
64
|
if atom_feed.updated
|
61
|
-
|
62
|
-
# will return nil : NilClass
|
63
|
-
|
64
|
-
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
65
|
-
|
66
|
-
feed.updated = atom_feed.updated.content.nil? ? nil : atom_feed.updated.content.to_datetime # .utc.strftime( "%Y-%m-%d %H:%M" )
|
67
|
-
logger.debug " atom | updated.content >#{atom_feed.updated.content}< : #{atom_feed.updated.content.class.name}"
|
65
|
+
feed.updated = handle_date( atom_feed.updated, 'feed.updated' )
|
68
66
|
end
|
69
67
|
|
70
68
|
if atom_feed.generator
|
71
69
|
## Note: remove (strip) leading and trailing spaces and newlines
|
72
70
|
feed.generator = atom_feed.generator.content.strip
|
73
|
-
logger.debug " atom | generator.content >#{atom_feed.generator.content}< : #{atom_feed.generator.content.class.name}"
|
71
|
+
logger.debug " atom | feed.generator.content >#{atom_feed.generator.content}< : #{atom_feed.generator.content.class.name}"
|
74
72
|
|
75
73
|
# pp atom_feed.generator
|
76
74
|
feed.generator_version = atom_feed.generator.version
|
77
75
|
feed.generator_uri = atom_feed.generator.uri
|
78
|
-
logger.debug " atom | generator.version >#{atom_feed.generator.version}< : #{atom_feed.generator.version.class.name}"
|
79
|
-
logger.debug " atom | generator.uri >#{atom_feed.generator.uri}< : #{atom_feed.generator.uri.class.name}"
|
76
|
+
logger.debug " atom | feed.generator.version >#{atom_feed.generator.version}< : #{atom_feed.generator.version.class.name}"
|
77
|
+
logger.debug " atom | feed.generator.uri >#{atom_feed.generator.uri}< : #{atom_feed.generator.uri.class.name}"
|
80
78
|
end
|
81
79
|
|
82
80
|
if atom_feed.subtitle
|
83
|
-
feed.
|
84
|
-
logger.debug " atom | subtitle.content >#{atom_feed.subtitle.content}< : #{atom_feed.subtitle.content.class.name}"
|
81
|
+
feed.summary = handle_content( atom_feed.subtitle, 'feed.subtitle => summary' )
|
85
82
|
end
|
86
83
|
|
87
84
|
|
@@ -94,48 +91,101 @@ class AtomFeedBuilder
|
|
94
91
|
feed # return new feed
|
95
92
|
end # method build_feed_from_atom
|
96
93
|
|
94
|
+
|
97
95
|
def build_feed_item( atom_item )
|
98
96
|
item = Item.new # Item.new
|
99
|
-
## item.object = atom_item # not used for now
|
100
97
|
|
101
|
-
item.title = atom_item.title.
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
98
|
+
item.title = handle_content( atom_item.title, 'item.title' )
|
99
|
+
|
100
|
+
## Note: item might have many links
|
101
|
+
## e.g. see blogger (headius)
|
102
|
+
## <link rel='replies' type='application/atom+xml' href='http://blog.headius.com/feeds/3430080308857860963/comments/default' title='Post Comments'/>
|
103
|
+
## <link rel='replies' type='text/html' href='http://blog.headius.com/2014/05/jrubyconfeu-2014.html#comment-form' title='0 Comments'/>
|
104
|
+
## <link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/4704664917418794835/posts/default/3430080308857860963'/>
|
105
|
+
## <link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/4704664917418794835/posts/default/3430080308857860963'/>
|
106
|
+
## <link rel='alternate' type='text/html' href='http://blog.headius.com/2014/05/jrubyconfeu-2014.html'
|
107
|
+
|
108
|
+
item.url = nil
|
109
|
+
|
110
|
+
if atom_item.links.size == 1
|
111
|
+
item.url = atom_item.link.href
|
112
|
+
logger.debug " atom | item.link.href >#{atom_item.link.href}< : #{atom_item.link.href.class.name}"
|
113
|
+
else
|
114
|
+
## note: use links (plural to allow multiple links e.g. self,alternate,etc.)
|
115
|
+
atom_item.links.each_with_index do |link,i|
|
116
|
+
logger.debug " atom | item.link[#{i+1}] rel=>#{link.rel}< : #{link.rel.class.name} type=>#{link.type}< href=>#{link.href}<"
|
117
|
+
## for now assume alternate is link or no rel specified (assumes alternate)
|
118
|
+
## note: only set if feed.url is NOT already set (via <id> for example)
|
119
|
+
if item.url.nil? && (link.rel == 'alternate' || link.rel.nil?)
|
120
|
+
item.url = link.href
|
121
|
+
end
|
122
|
+
end
|
123
|
+
end
|
106
124
|
|
107
125
|
|
108
126
|
if atom_item.updated
|
109
|
-
|
110
|
-
# -- .utc.strftime( "%Y-%m-%d %H:%M" )
|
111
|
-
|
112
|
-
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
113
|
-
|
114
|
-
item.updated = atom_item.updated.content.nil? ? nil : atom_item.updated.content.to_datetime
|
115
|
-
logger.debug " atom | item.updated.content >#{atom_item.updated.content}< : #{atom_item.updated.content.class.name}"
|
127
|
+
item.updated = handle_date( atom_item.updated, 'item.updated' )
|
116
128
|
end
|
117
129
|
|
118
130
|
if atom_item.published
|
119
|
-
|
120
|
-
|
121
|
-
item.published = atom_item.published.content.nil? ? nil : atom_item.published.content.to_datetime
|
122
|
-
logger.debug " atom | item.published.content >#{atom_item.published.content}< : #{atom_item.published.content.class.name}"
|
131
|
+
item.published = handle_date( atom_item.published, 'item.published' )
|
123
132
|
end
|
124
133
|
|
125
134
|
|
126
135
|
item.guid = atom_item.id.content
|
127
|
-
logger.debug " atom | item.id.content
|
136
|
+
logger.debug " atom | item.id.content >#{atom_item.id.content}< : #{atom_item.id.content.class.name}"
|
128
137
|
|
129
138
|
if atom_item.content
|
130
139
|
item.content = atom_item.content.content
|
131
140
|
end
|
132
141
|
|
133
142
|
if atom_item.summary
|
134
|
-
item.summary = atom_item.summary.
|
143
|
+
item.summary = handle_content( atom_item.summary, 'item.summary' )
|
135
144
|
end
|
136
145
|
|
137
146
|
item
|
138
147
|
end # method build_feed_item
|
139
148
|
|
149
|
+
|
150
|
+
|
151
|
+
def handle_date( el, name )
|
152
|
+
## change time to utc if present? why? why not?
|
153
|
+
# -- .utc.strftime( "%Y-%m-%d %H:%M" )
|
154
|
+
|
155
|
+
###############
|
156
|
+
# examples:
|
157
|
+
# 2015-01-02 01:56:06 +0100
|
158
|
+
|
159
|
+
logger.debug " atom | #{name}.content >#{el.content}< : #{el.content.class.name}"
|
160
|
+
|
161
|
+
# NOTE: empty updated.content possible e.g. used by google groups feed (e.g. <updated></updated>)
|
162
|
+
# will return nil : NilClass
|
163
|
+
|
164
|
+
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
165
|
+
date = if el.content.nil?
|
166
|
+
nil
|
167
|
+
else
|
168
|
+
el.content.to_datetime
|
169
|
+
end
|
170
|
+
|
171
|
+
date
|
172
|
+
end
|
173
|
+
|
174
|
+
|
175
|
+
def handle_content( el, name ) ## rename to handle_plain_vanilla_text_content - why? why not?
|
176
|
+
### todo/fix: if type html ?? strip html tags n attributes
|
177
|
+
## always strip html tags n attributes?? why? why not?
|
178
|
+
|
179
|
+
## check if content.nil? possible e.g. <title></title> => empty string or nil?
|
180
|
+
|
181
|
+
## note: dump head (first 30 chars)
|
182
|
+
logger.debug " atom | #{name}.content[0..30] (type=>#{el.type}<) >#{el.content[0..30]}< : #{el.content.class.name}"
|
183
|
+
|
184
|
+
## note: always strip leading and trailing whitespaces (spaces/tabs/newlines)
|
185
|
+
text = el.content.strip
|
186
|
+
text
|
187
|
+
end
|
188
|
+
|
189
|
+
|
140
190
|
end # AtomFeedBuilder
|
141
191
|
end # FeedParser
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
|
2
3
|
module FeedParser
|
3
4
|
|
@@ -8,6 +9,12 @@ class RssFeedBuilder
|
|
8
9
|
|
9
10
|
include LogUtils::Logging
|
10
11
|
|
12
|
+
|
13
|
+
def self.build( rss_feed )
|
14
|
+
feed = self.new( rss_feed )
|
15
|
+
feed.to_feed
|
16
|
+
end
|
17
|
+
|
11
18
|
def initialize( rss_feed )
|
12
19
|
@feed = build_feed( rss_feed )
|
13
20
|
end
|
@@ -16,41 +23,25 @@ class RssFeedBuilder
|
|
16
23
|
@feed
|
17
24
|
end
|
18
25
|
|
19
|
-
def self.build( rss_feed )
|
20
|
-
feed = self.new( rss_feed )
|
21
|
-
feed.to_feed
|
22
|
-
end
|
23
26
|
|
24
27
|
|
25
28
|
def build_feed( rss_feed )
|
26
29
|
feed = Feed.new
|
27
|
-
## feed.object = rss_feed # not use for now
|
28
30
|
feed.format = "rss #{rss_feed.rss_version}"
|
29
31
|
|
30
|
-
feed.
|
31
|
-
feed.url = rss_feed.channel.link # required
|
32
|
-
feed.summary = rss_feed.channel.description # required
|
33
|
-
|
34
|
-
logger.debug " rss | channel.description: >#{rss_feed.channel.description}< : #{rss_feed.channel.description.class.name}"
|
35
|
-
|
36
|
-
# NOTE:
|
37
|
-
# All date-times in RSS conform
|
38
|
-
# to the Date and Time Specification of RFC 822
|
39
|
-
# e.g. Sun, 19 May 2012 15:21:36 GMT or
|
40
|
-
# Sat, 07 Sep 2013 00:00:01 GMT
|
32
|
+
logger.debug " rss | feed.version >#{rss_feed.rss_version}<"
|
41
33
|
|
42
|
-
|
43
|
-
|
44
|
-
feed.
|
45
|
-
feed.published = rss_feed.channel.pubDate.nil? ? nil : rss_feed.channel.pubDate.to_datetime # optional
|
34
|
+
feed.title = handle_content( rss_feed.channel.title, 'feed.title' ) # required
|
35
|
+
feed.summary = handle_content( rss_feed.channel.description, 'feed.description => summary' ) # required
|
36
|
+
feed.url = rss_feed.channel.link # required
|
46
37
|
|
47
|
-
|
48
|
-
|
38
|
+
feed.updated = handle_date( rss_feed.channel.lastBuildDate, 'feed.lastBuildDate => updated' ) # optional
|
39
|
+
feed.published = handle_date( rss_feed.channel.pubDate, 'feed.pubDate => published' ) # optional
|
49
40
|
|
50
41
|
|
51
|
-
feed.generator = rss_feed.channel.generator
|
42
|
+
feed.generator = rss_feed.channel.generator # optional
|
52
43
|
|
53
|
-
logger.debug " rss |
|
44
|
+
logger.debug " rss | feed.generator >#{rss_feed.channel.generator}< : #{rss_feed.channel.generator.class.name}"
|
54
45
|
|
55
46
|
|
56
47
|
items = []
|
@@ -65,13 +56,12 @@ class RssFeedBuilder
|
|
65
56
|
def build_feed_item( rss_item )
|
66
57
|
|
67
58
|
item = Item.new
|
68
|
-
## item.object = rss_item # not use for now
|
69
59
|
|
70
|
-
item.title = rss_item.title
|
60
|
+
item.title = handle_content( rss_item.title, 'item.title' )
|
71
61
|
item.url = rss_item.link
|
72
62
|
|
73
|
-
logger.debug " rss | item.
|
74
|
-
|
63
|
+
logger.debug " rss | item.link >#{rss_item.link}< : #{rss_item.link.class.name}"
|
64
|
+
|
75
65
|
|
76
66
|
## todo:
|
77
67
|
## check if feedburner:origLink present - if yes, use it for url/link
|
@@ -81,24 +71,15 @@ class RssFeedBuilder
|
|
81
71
|
## - <link>http://feedproxy.google.com/~r/Rubyflow/~3/Ym9Sltg_2_c/9803-gotta-ruby-s-syntax</link>
|
82
72
|
|
83
73
|
|
84
|
-
item.summary = rss_item.description
|
74
|
+
item.summary = handle_content( rss_item.description, 'item.description => summary' )
|
85
75
|
|
86
76
|
# check for <content:encoded>
|
87
77
|
# -- using RSS 1.0 content module in RSS 2.0
|
88
78
|
item.content = rss_item.content_encoded
|
89
|
-
logger.debug " rss | item.content_encoded[0..40]
|
90
|
-
|
91
|
-
# NOTE:
|
92
|
-
# All date-times in RSS conform
|
93
|
-
# to the Date and Time Specification of RFC 822
|
94
|
-
# e.g. Sun, 19 May 2012 15:21:36 GMT or
|
95
|
-
# Sat, 07 Sep 2013 00:00:01 GMT
|
96
|
-
|
97
|
-
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
79
|
+
logger.debug " rss | item.content_encoded[0..40] >#{rss_item.content_encoded ? rss_item.content_encoded[0..40] : ''}< : #{rss_item.content_encoded.class.name}"
|
98
80
|
|
99
|
-
item.published = rss_item.pubDate.nil? ? nil : rss_item.pubDate.to_datetime # .utc.strftime( "%Y-%m-%d %H:%M" )
|
100
81
|
|
101
|
-
|
82
|
+
item.updated = handle_date( rss_item.pubDate, 'item.pubDate => updated' )
|
102
83
|
|
103
84
|
|
104
85
|
## fix/todo: check if rss_item.guid present? !!!!
|
@@ -107,7 +88,7 @@ class RssFeedBuilder
|
|
107
88
|
|
108
89
|
if rss_item.guid && rss_item.guid.content
|
109
90
|
item.guid = rss_item.guid.content
|
110
|
-
logger.debug " rss | item.guid.content
|
91
|
+
logger.debug " rss | item.guid.content >#{rss_item.guid.content}< : #{rss_item.guid.content.class.name}"
|
111
92
|
else
|
112
93
|
item.guid = rss_item.link
|
113
94
|
logger.warn " rss | item.guid.content missing !!!! - using link for guid"
|
@@ -121,9 +102,56 @@ class RssFeedBuilder
|
|
121
102
|
# <category><![CDATA[Ruby]]></category>
|
122
103
|
# <category><![CDATA[Ruby on Rails]]></category>
|
123
104
|
|
124
|
-
|
125
105
|
item
|
126
106
|
end # method build_feed_item_from_rss
|
127
107
|
|
108
|
+
|
109
|
+
|
110
|
+
def handle_date( el, name )
|
111
|
+
## change time to utc if present? why? why not?
|
112
|
+
# -- .utc.strftime( "%Y-%m-%d %H:%M" )
|
113
|
+
|
114
|
+
# NOTE:
|
115
|
+
# All date-times in RSS conform
|
116
|
+
# to the Date and Time Specification of RFC 822
|
117
|
+
# e.g. Sun, 19 May 2012 15:21:36 GMT or
|
118
|
+
# Sat, 07 Sep 2013 00:00:01 GMT
|
119
|
+
|
120
|
+
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
121
|
+
|
122
|
+
logger.debug " rss | #{name} >#{el}< : #{el.class.name}"
|
123
|
+
|
124
|
+
|
125
|
+
## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
|
126
|
+
date = if el.nil?
|
127
|
+
nil
|
128
|
+
else
|
129
|
+
el.to_datetime
|
130
|
+
end
|
131
|
+
|
132
|
+
date
|
133
|
+
end
|
134
|
+
|
135
|
+
def handle_content( el, name )
|
136
|
+
## note:
|
137
|
+
# use for feed.title, feed.description
|
138
|
+
# item.title, item.description
|
139
|
+
#
|
140
|
+
# do NOT use for others e.g. feed.generator, etc.
|
141
|
+
|
142
|
+
|
143
|
+
## todo/fix: strip html tags n attributes ???
|
144
|
+
|
145
|
+
logger.debug " rss | #{name} >#{el}< : #{el.class.name}"
|
146
|
+
|
147
|
+
text = if el.nil?
|
148
|
+
nil
|
149
|
+
else
|
150
|
+
el.strip
|
151
|
+
end
|
152
|
+
text
|
153
|
+
end
|
154
|
+
|
155
|
+
|
128
156
|
end # class RssFeedBuilder
|
129
157
|
end # module FeedParser
|