feedparser 0.2.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 747fe0251bd3069a2c17b9ef047673386410b314
4
- data.tar.gz: 2d5e9a169f84b90e3fd5b66c4b614e5b57adc82e
3
+ metadata.gz: 8986d2c787017a536660de47dc443469b6eb2a0a
4
+ data.tar.gz: 94441f640c433a63de4f94ffcfbd993af0e4139b
5
5
  SHA512:
6
- metadata.gz: 4042d15f3212979e796b7e979acf27f8da6d19fa8105580649d7bd8239f3f73620a0b99ec7254b11f6a77f3e7db10abe7b87126f10551265c6e8574ba1b62efa
7
- data.tar.gz: 1b1545a13e22c2320020d32b3d4a7a817c6d22d692df9d5a72c70cb1cb6a383cc12d59f279f9ef5734034f17f9a8169561db8d263cfa2c56471dfdd964ecb80e
6
+ metadata.gz: 0643b97dc231542d7e0c426191afa625e1114b60c2828abd9615f339509f63d36f6c44fa463877c3932dd9ef6739295eaba24af98d2c942f382398c7e91352d0
7
+ data.tar.gz: fc838c9f2e4875ff8def0c2855b69bd1786d2391ce9298480cf9b6782915fd88c91c22f961c9f59c17b5e16fcb8b3eb14a0929263c4e2b98fe86cd1d7a26a0ab
@@ -11,8 +11,15 @@ lib/feedparser/parser.rb
11
11
  lib/feedparser/version.rb
12
12
  test/feeds/googlegroups.atom
13
13
  test/feeds/googlegroups2.atom
14
- test/feeds/quirksblog.atom.v03
14
+ test/feeds/headius.atom
15
+ test/feeds/lambdatheultimate.rss2
16
+ test/feeds/railstutorial.atom
17
+ test/feeds/rubyflow.rss2
18
+ test/feeds/rubymine.rss2
19
+ test/feeds/rubyonrails.atom
20
+ test/feeds/sitepoint.rss2
15
21
  test/helper.rb
16
22
  test/test_atom.rb
17
- test/test_atom_from_file.rb
23
+ test/test_atom_live.rb
18
24
  test/test_rss.rb
25
+ test/test_rss_live.rb
data/README.md CHANGED
@@ -17,24 +17,49 @@ Feed • Item
17
17
 
18
18
  ### `Feed` Struct
19
19
 
20
+ #### Mappings
21
+
22
+ Note: uses question mark (`?`) for optional elements (otherwise assume required elements)
23
+
24
+ **Title 'n' Summary**
25
+
26
+ Note: The Feed parser will remove all html tags and attributes from the title (RSS 2.0+Atom),
27
+ description (RSS 2.0) and subtitle (Atom) content and will unescape HTML entities e.g. `&amp;` becomes `&` and so on - always
28
+ resulting in plain vanilla text.
29
+
30
+ | Feed Struct | RSS 2.0 | Notes | Atom | Notes |
31
+ | ------------------ | ----------------- | ------------------- | ------------- | ------------------- |
32
+ | `feed.title` | `title` | plain vanilla text | `title` | plain vanilla text |
33
+ | `feed.summary` | `description` | plain vanilla text | `subtitle`? | plain vanilla text |
34
+
35
+
36
+ **Dates**
37
+
38
+ | Feed Struct | RSS 2.0 | Notes | Atom | Notes |
39
+ | ------------------ | ------------------- | ----------------- | ---------- | --------------- |
40
+ | `feed.updated` | `lastBuildDate`? | RFC-822 format | `updated` | ISO 8601 format |
41
+ | `feed.published` | `pubDate`? | RFC-822 format | - | |
42
+
43
+ Note: Check - for RSS 2.0 set feed.updated to pubDate or lastBuildDate if only one present? if both present - map as above.
44
+
45
+
46
+ RFC-822 date format e.g. Wed, 14 Jan 2015 19:48:57 +0100
47
+
48
+ ISO-8601 date format e.g. 2015-01-11T09:30:16Z
49
+
50
+
20
51
  ~~~
21
52
  class Feed
22
53
  attr_accessor :format # e.g. atom|rss 2.0|etc.
23
- attr_accessor :title
24
- attr_accessor :title_type # e.g. text|html|html-escaped (optional) -use - why?? why not??
54
+ attr_accessor :title # note: always plain vanilla text - if present html tags will get stripped and html entities unescaped
25
55
  attr_accessor :url
26
56
 
27
57
  attr_accessor :items
28
58
 
29
- attr_accessor :summary # e.g. description (rss)
30
- attr_accessor :summary_type # e.g. text|html|html-escaped
31
-
32
- attr_accessor :title2 # e.g. subtitle (atom)
33
- attr_accessor :title2_type # e.g. text|html|html-escaped
59
+ attr_accessor :summary # note: is description in RSS 2.0 and subtitle in Atom; always plain vanilla text
34
60
 
35
- attr_accessor :published
36
- attr_accessor :updated
37
- attr_accessor :built
61
+ attr_accessor :updated # note: is lastBuildDate in RSS 2.0
62
+ attr_accessor :published # note: is pubDate in RSS 2.0; not available in Atom
38
63
 
39
64
  attr_accessor :generator
40
65
  attr_accessor :generator_version # e.g. @version (atom)
@@ -45,20 +70,48 @@ end
45
70
 
46
71
  ### `Item` Struct
47
72
 
73
+ **Title 'n' Summary**
74
+
75
+ Note: The Feed parser will remove all html tags and attributes from the title (RSS 2.0+Atom),
76
+ description (RSS 2.0) and summary (Atom) content
77
+ and will unescape HTML entities e.g. `&amp;` becomes `&` and so on - always
78
+ resulting in plain vanilla text.
79
+
80
+ Note: In plain vanilla RSS 2.0 there's no difference between (full) content and summary - everything is wrapped
81
+ in a description element; however, best practice is using the content "module" from RSS 1.0 inside RSS 2.0.
82
+ If there's no content module present the feed parser will "clone" the description and use one version for `item.summary` and
83
+ the clone for `item.content`.
84
+
85
+ Note: The content element will assume html content.
86
+
87
+ | Feed Struct | RSS 2.0 | Notes | Atom | Notes |
88
+ | ------------------ | ----------------- | ------------------- | ------------- | ------------------- |
89
+ | `item.title` | `title` | plain vanilla text | `title` | plain vanilla text |
90
+ | `item.summary` | `description` | plain vanilla text | `summary`? | plain vanilla text |
91
+ | `item.content` | `content`? | html | `content`? | html |
92
+
93
+
94
+ **Dates**
95
+
96
+ | Item Struct | RSS 2.0 | Notes | Atom | Notes |
97
+ | ------------------ | ------------------- | ----------------- | ------------- | --------------- |
98
+ | `item.updated` | `pubDate`? | RFC-822 format | `updated` | ISO 8601 format |
99
+ | `item.published` | - | RFC-822 format | `published`? | ISO 8601 format |
100
+
101
+ Note: In plain vanilla RSS 2.0 there's only one `pubDate` for items, thus, it's not possible to differentiate between published and updated dates for items; note - the `item.pubDate` will get mapped to `item.updated`. To set the published date in RSS 2.0 use the dublin core module e.g. `dc:created`.
102
+
48
103
  ~~~
49
104
  class Item
50
- attr_accessor :title
51
- attr_accessor :title_type # optional for now (text|html|html-escaped) - not yet set
52
- attr_accessor :url # todo: rename to link (use alias) ??
105
+ attr_accessor :title # note: always plain vanilla text - if present html tags will get stripped and html entities unescaped
106
+ attr_accessor :url
53
107
 
54
108
  attr_accessor :content
55
109
  attr_accessor :content_type # optional for now (text|html|html-escaped|binary-base64) - not yet set
56
110
 
57
111
  attr_accessor :summary
58
- attr_accessor :summary_type # optional for now (text|html|html-escaped) - not yet set
59
112
 
60
- attr_accessor :published
61
- attr_accessor :updated
113
+ attr_accessor :updated # note: is pubDate in RSS 2.0 and updated in Atom
114
+ attr_accessor :published # note: is published in Atom; not available in RSS 2.0 (use dc:created ??)
62
115
 
63
116
  attr_accessor :guid # todo: rename to id (use alias) ??
64
117
  end
@@ -78,17 +131,6 @@ pp feed
78
131
  ~~~
79
132
 
80
133
 
81
-
82
- ## Alternatives
83
-
84
- - [`syndication`](http://syndication.rubyforge.org) [(Source)](https://github.com/lpar/syndication) - by Mathew (aka lpar); RSS 1.0, 2.0, Atom, and understands namespaces; optional support for Dublin Core, iTunes/podcast feeds, and RSS 1.0 Syndication and Content modules
85
- - [`simple-rss`](http://rubyforge.org/projects/simple-rss)
86
- - [`feedtools`](http://rubyforge.org/projects/feedtools)
87
-
88
- TBD
89
-
90
-
91
-
92
134
  ## Install
93
135
 
94
136
  Just install the gem:
data/Rakefile CHANGED
@@ -18,7 +18,8 @@ Hoe.spec 'feedparser' do
18
18
  self.history_file = 'HISTORY.md'
19
19
 
20
20
  self.extra_deps = [
21
- ['logutils', '>= 0.6.1']
21
+ ['logutils', '>=0.6.1'],
22
+ ['textutils', '>=1.0.0'],
22
23
  ]
23
24
 
24
25
  ### todo: add fetcher dep for testing (e.g. development only)
@@ -1,12 +1,18 @@
1
+ # encoding: utf-8
2
+
3
+
1
4
  # core and stdlibs
2
5
 
3
6
  require 'rss'
4
7
  require 'pp'
5
- require 'date'
8
+ require 'time' # note: ruby has a builtin core time class and a stdlib time class pack; require stdlib extensions
9
+ require 'date' # note: ruby has a builtin core date class and a stdlib date class pack; require stdlib extensions
6
10
 
7
11
  # 3rd party gems/libs
8
12
 
9
13
  require 'logutils'
14
+ require 'textutils'
15
+
10
16
 
11
17
  # our own code
12
18
 
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
 
2
3
  module FeedParser
3
4
 
@@ -5,6 +6,12 @@ class AtomFeedBuilder
5
6
 
6
7
  include LogUtils::Logging
7
8
 
9
+
10
+ def self.build( atom_feed )
11
+ feed = self.new( atom_feed )
12
+ feed.to_feed
13
+ end
14
+
8
15
  def initialize( atom_feed )
9
16
  @feed = build_feed( atom_feed )
10
17
  end
@@ -13,28 +20,21 @@ class AtomFeedBuilder
13
20
  @feed
14
21
  end
15
22
 
16
- def self.build( atom_feed )
17
- feed = self.new( atom_feed )
18
- feed.to_feed
19
- end
20
23
 
21
24
 
22
25
  def build_feed( atom_feed )
23
26
  feed = Feed.new
24
- ## feed.object = atom_feed # not use for now
25
27
  feed.format = 'atom'
26
28
 
27
- feed.title = atom_feed.title.content
28
- logger.debug " atom | title.content >#{atom_feed.title.content}< : #{atom_feed.title.content.class.name}"
29
-
29
+ feed.title = handle_content( atom_feed.title, 'feed.title' )
30
30
 
31
- logger.debug " atom | id.content >#{atom_feed.id.content}< : #{atom_feed.id.content.class.name}"
31
+ logger.debug " atom | feed.id.content >#{atom_feed.id.content}< : #{atom_feed.id.content.class.name}"
32
32
 
33
33
  feed.url = nil
34
34
 
35
35
  ## note: use links (plural to allow multiple links e.g. self,alternate,etc.)
36
36
  atom_feed.links.each_with_index do |link,i|
37
- logger.debug " atom | link[#{i+1}] link rel=>#{link.rel}< : #{link.rel.class.name} type=#{link.type} href=#{link.href}"
37
+ logger.debug " atom | feed.link[#{i+1}] rel=>#{link.rel}< : #{link.rel.class.name} type=>#{link.type}< href=>#{link.href}<"
38
38
 
39
39
  ## for now assume alternate is link or no rel specified (assumes alternate)
40
40
  ## note: only set if feed.url is NOT already set (via <id> for example)
@@ -43,7 +43,11 @@ class AtomFeedBuilder
43
43
  end
44
44
  end
45
45
 
46
- ## note: as fallback try id if still no url found
46
+ if feed.url.nil?
47
+ ### todo/fix: issue warning - no link found!!!!
48
+ end
49
+
50
+ ## note: as fallback try id if still no url found - why?? why not??
47
51
  ## use url only if starts_with http
48
52
  ## might not be link e.g blogger uses for ids =>
49
53
  ## <id>tag:blogger.com,1999:blog-4704664917418794835</id>
@@ -58,30 +62,23 @@ class AtomFeedBuilder
58
62
 
59
63
 
60
64
  if atom_feed.updated
61
- # NOTE: empty updated.content e.g. used by google groups feed
62
- # will return nil : NilClass
63
-
64
- ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
65
-
66
- feed.updated = atom_feed.updated.content.nil? ? nil : atom_feed.updated.content.to_datetime # .utc.strftime( "%Y-%m-%d %H:%M" )
67
- logger.debug " atom | updated.content >#{atom_feed.updated.content}< : #{atom_feed.updated.content.class.name}"
65
+ feed.updated = handle_date( atom_feed.updated, 'feed.updated' )
68
66
  end
69
67
 
70
68
  if atom_feed.generator
71
69
  ## Note: remove (strip) leading and trailing spaces and newlines
72
70
  feed.generator = atom_feed.generator.content.strip
73
- logger.debug " atom | generator.content >#{atom_feed.generator.content}< : #{atom_feed.generator.content.class.name}"
71
+ logger.debug " atom | feed.generator.content >#{atom_feed.generator.content}< : #{atom_feed.generator.content.class.name}"
74
72
 
75
73
  # pp atom_feed.generator
76
74
  feed.generator_version = atom_feed.generator.version
77
75
  feed.generator_uri = atom_feed.generator.uri
78
- logger.debug " atom | generator.version >#{atom_feed.generator.version}< : #{atom_feed.generator.version.class.name}"
79
- logger.debug " atom | generator.uri >#{atom_feed.generator.uri}< : #{atom_feed.generator.uri.class.name}"
76
+ logger.debug " atom | feed.generator.version >#{atom_feed.generator.version}< : #{atom_feed.generator.version.class.name}"
77
+ logger.debug " atom | feed.generator.uri >#{atom_feed.generator.uri}< : #{atom_feed.generator.uri.class.name}"
80
78
  end
81
79
 
82
80
  if atom_feed.subtitle
83
- feed.title2 = atom_feed.subtitle.content
84
- logger.debug " atom | subtitle.content >#{atom_feed.subtitle.content}< : #{atom_feed.subtitle.content.class.name}"
81
+ feed.summary = handle_content( atom_feed.subtitle, 'feed.subtitle => summary' )
85
82
  end
86
83
 
87
84
 
@@ -94,48 +91,101 @@ class AtomFeedBuilder
94
91
  feed # return new feed
95
92
  end # method build_feed_from_atom
96
93
 
94
+
97
95
  def build_feed_item( atom_item )
98
96
  item = Item.new # Item.new
99
- ## item.object = atom_item # not used for now
100
97
 
101
- item.title = atom_item.title.content
102
- item.url = atom_item.link.href
103
-
104
- logger.debug " atom | item.title.content: >#{atom_item.title.content}< : #{atom_item.title.content.class.name}"
105
- logger.debug " atom | item.link.href: >#{atom_item.link.href}< : #{atom_item.link.href.class.name}"
98
+ item.title = handle_content( atom_item.title, 'item.title' )
99
+
100
+ ## Note: item might have many links
101
+ ## e.g. see blogger (headius)
102
+ ## <link rel='replies' type='application/atom+xml' href='http://blog.headius.com/feeds/3430080308857860963/comments/default' title='Post Comments'/>
103
+ ## <link rel='replies' type='text/html' href='http://blog.headius.com/2014/05/jrubyconfeu-2014.html#comment-form' title='0 Comments'/>
104
+ ## <link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/4704664917418794835/posts/default/3430080308857860963'/>
105
+ ## <link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/4704664917418794835/posts/default/3430080308857860963'/>
106
+ ## <link rel='alternate' type='text/html' href='http://blog.headius.com/2014/05/jrubyconfeu-2014.html'
107
+
108
+ item.url = nil
109
+
110
+ if atom_item.links.size == 1
111
+ item.url = atom_item.link.href
112
+ logger.debug " atom | item.link.href >#{atom_item.link.href}< : #{atom_item.link.href.class.name}"
113
+ else
114
+ ## note: use links (plural to allow multiple links e.g. self,alternate,etc.)
115
+ atom_item.links.each_with_index do |link,i|
116
+ logger.debug " atom | item.link[#{i+1}] rel=>#{link.rel}< : #{link.rel.class.name} type=>#{link.type}< href=>#{link.href}<"
117
+ ## for now assume alternate is link or no rel specified (assumes alternate)
118
+ ## note: only set if item.url is NOT already set (via <id> for example)
119
+ if item.url.nil? && (link.rel == 'alternate' || link.rel.nil?)
120
+ item.url = link.href
121
+ end
122
+ end
123
+ end
106
124
 
107
125
 
108
126
  if atom_item.updated
109
- ## change time to utc if present? why? why not?
110
- # -- .utc.strftime( "%Y-%m-%d %H:%M" )
111
-
112
- ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
113
-
114
- item.updated = atom_item.updated.content.nil? ? nil : atom_item.updated.content.to_datetime
115
- logger.debug " atom | item.updated.content >#{atom_item.updated.content}< : #{atom_item.updated.content.class.name}"
127
+ item.updated = handle_date( atom_item.updated, 'item.updated' )
116
128
  end
117
129
 
118
130
  if atom_item.published
119
- ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
120
-
121
- item.published = atom_item.published.content.nil? ? nil : atom_item.published.content.to_datetime
122
- logger.debug " atom | item.published.content >#{atom_item.published.content}< : #{atom_item.published.content.class.name}"
131
+ item.published = handle_date( atom_item.published, 'item.published' )
123
132
  end
124
133
 
125
134
 
126
135
  item.guid = atom_item.id.content
127
- logger.debug " atom | item.id.content: >#{atom_item.id.content}< : #{atom_item.id.content.class.name}"
136
+ logger.debug " atom | item.id.content >#{atom_item.id.content}< : #{atom_item.id.content.class.name}"
128
137
 
129
138
  if atom_item.content
130
139
  item.content = atom_item.content.content
131
140
  end
132
141
 
133
142
  if atom_item.summary
134
- item.summary = atom_item.summary.content
143
+ item.summary = handle_content( atom_item.summary, 'item.summary' )
135
144
  end
136
145
 
137
146
  item
138
147
  end # method build_feed_item
139
148
 
149
+
150
+
151
+ def handle_date( el, name )
152
+ ## change time to utc if present? why? why not?
153
+ # -- .utc.strftime( "%Y-%m-%d %H:%M" )
154
+
155
+ ###############
156
+ # examples:
157
+ # 2015-01-02 01:56:06 +0100
158
+
159
+ logger.debug " atom | #{name}.content >#{el.content}< : #{el.content.class.name}"
160
+
161
+ # NOTE: empty updated.content possible e.g. used by google groups feed (e.g. <updated></updated>)
162
+ # will return nil : NilClass
163
+
164
+ ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
165
+ date = if el.content.nil?
166
+ nil
167
+ else
168
+ el.content.to_datetime
169
+ end
170
+
171
+ date
172
+ end
173
+
174
+
175
+ def handle_content( el, name ) ## rename to handle_plain_vanilla_text_content - why? why not?
176
+ ### todo/fix: if type html ?? strip html tags n attributes
177
+ ## always strip html tags n attributes?? why? why not?
178
+
179
+ ## check if content.nil? possible e.g. <title></title> => empty string or nil?
180
+
181
+ ## note: dump head (first 30 chars)
182
+ logger.debug " atom | #{name}.content[0..30] (type=>#{el.type}<) >#{el.content[0..30]}< : #{el.content.class.name}"
183
+
184
+ ## note: always strip leading and trailing whitespaces (spaces/tabs/newlines)
185
+ text = el.content.strip
186
+ text
187
+ end
188
+
189
+
140
190
  end # AtomFeedBuilder
141
191
  end # FeedParser
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
 
2
3
  module FeedParser
3
4
 
@@ -8,6 +9,12 @@ class RssFeedBuilder
8
9
 
9
10
  include LogUtils::Logging
10
11
 
12
+
13
+ def self.build( rss_feed )
14
+ feed = self.new( rss_feed )
15
+ feed.to_feed
16
+ end
17
+
11
18
  def initialize( rss_feed )
12
19
  @feed = build_feed( rss_feed )
13
20
  end
@@ -16,41 +23,25 @@ class RssFeedBuilder
16
23
  @feed
17
24
  end
18
25
 
19
- def self.build( rss_feed )
20
- feed = self.new( rss_feed )
21
- feed.to_feed
22
- end
23
26
 
24
27
 
25
28
  def build_feed( rss_feed )
26
29
  feed = Feed.new
27
- ## feed.object = rss_feed # not use for now
28
30
  feed.format = "rss #{rss_feed.rss_version}"
29
31
 
30
- feed.title = rss_feed.channel.title # required
31
- feed.url = rss_feed.channel.link # required
32
- feed.summary = rss_feed.channel.description # required
33
-
34
- logger.debug " rss | channel.description: >#{rss_feed.channel.description}< : #{rss_feed.channel.description.class.name}"
35
-
36
- # NOTE:
37
- # All date-times in RSS conform
38
- # to the Date and Time Specification of RFC 822
39
- # e.g. Sun, 19 May 2012 15:21:36 GMT or
40
- # Sat, 07 Sep 2013 00:00:01 GMT
32
+ logger.debug " rss | feed.version >#{rss_feed.rss_version}<"
41
33
 
42
- ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
43
-
44
- feed.built = rss_feed.channel.lastBuildDate.nil? ? nil : rss_feed.channel.lastBuildDate.to_datetime # optional
45
- feed.published = rss_feed.channel.pubDate.nil? ? nil : rss_feed.channel.pubDate.to_datetime # optional
34
+ feed.title = handle_content( rss_feed.channel.title, 'feed.title' ) # required
35
+ feed.summary = handle_content( rss_feed.channel.description, 'feed.description => summary' ) # required
36
+ feed.url = rss_feed.channel.link # required
46
37
 
47
- logger.debug " rss | channel.lastBuildDate: >#{rss_feed.channel.lastBuildDate}< : #{rss_feed.channel.lastBuildDate.class.name}"
48
- logger.debug " rss | channel.pubDate: >#{rss_feed.channel.pubDate}< : #{rss_feed.channel.pubDate.class.name}"
38
+ feed.updated = handle_date( rss_feed.channel.lastBuildDate, 'feed.lastBuildDate => updated' ) # optional
39
+ feed.published = handle_date( rss_feed.channel.pubDate, 'feed.pubDate => published' ) # optional
49
40
 
50
41
 
51
- feed.generator = rss_feed.channel.generator # optional
42
+ feed.generator = rss_feed.channel.generator # optional
52
43
 
53
- logger.debug " rss | channel.generator: >#{rss_feed.channel.generator}< : #{rss_feed.channel.generator.class.name}"
44
+ logger.debug " rss | feed.generator >#{rss_feed.channel.generator}< : #{rss_feed.channel.generator.class.name}"
54
45
 
55
46
 
56
47
  items = []
@@ -65,13 +56,12 @@ class RssFeedBuilder
65
56
  def build_feed_item( rss_item )
66
57
 
67
58
  item = Item.new
68
- ## item.object = rss_item # not use for now
69
59
 
70
- item.title = rss_item.title
60
+ item.title = handle_content( rss_item.title, 'item.title' )
71
61
  item.url = rss_item.link
72
62
 
73
- logger.debug " rss | item.title: >#{rss_item.title}< : #{rss_item.title.class.name}"
74
- logger.debug " rss | item.link: >#{rss_item.link}< : #{rss_item.link.class.name}"
63
+ logger.debug " rss | item.link >#{rss_item.link}< : #{rss_item.link.class.name}"
64
+
75
65
 
76
66
  ## todo:
77
67
  ## check if feedburner:origLink present - if yes, use it for url/link
@@ -81,24 +71,15 @@ class RssFeedBuilder
81
71
  ## - <link>http://feedproxy.google.com/~r/Rubyflow/~3/Ym9Sltg_2_c/9803-gotta-ruby-s-syntax</link>
82
72
 
83
73
 
84
- item.summary = rss_item.description
74
+ item.summary = handle_content( rss_item.description, 'item.description => summary' )
85
75
 
86
76
  # check for <content:encoded>
87
77
  # -- using RSS 1.0 content module in RSS 2.0
88
78
  item.content = rss_item.content_encoded
89
- logger.debug " rss | item.content_encoded[0..40]: >#{rss_item.content_encoded ? rss_item.content_encoded[0..40] : ''}< : #{rss_item.content_encoded.class.name}"
90
-
91
- # NOTE:
92
- # All date-times in RSS conform
93
- # to the Date and Time Specification of RFC 822
94
- # e.g. Sun, 19 May 2012 15:21:36 GMT or
95
- # Sat, 07 Sep 2013 00:00:01 GMT
96
-
97
- ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
79
+ logger.debug " rss | item.content_encoded[0..40] >#{rss_item.content_encoded ? rss_item.content_encoded[0..40] : ''}< : #{rss_item.content_encoded.class.name}"
98
80
 
99
- item.published = rss_item.pubDate.nil? ? nil : rss_item.pubDate.to_datetime # .utc.strftime( "%Y-%m-%d %H:%M" )
100
81
 
101
- logger.debug " rss | item.pubDate: >#{rss_item.pubDate}< : #{rss_item.pubDate.class.name}"
82
+ item.updated = handle_date( rss_item.pubDate, 'item.pubDate => updated' )
102
83
 
103
84
 
104
85
  ## fix/todo: check if rss_item.guid present? !!!!
@@ -107,7 +88,7 @@ class RssFeedBuilder
107
88
 
108
89
  if rss_item.guid && rss_item.guid.content
109
90
  item.guid = rss_item.guid.content
110
- logger.debug " rss | item.guid.content: >#{rss_item.guid.content}< : #{rss_item.guid.content.class.name}"
91
+ logger.debug " rss | item.guid.content >#{rss_item.guid.content}< : #{rss_item.guid.content.class.name}"
111
92
  else
112
93
  item.guid = rss_item.link
113
94
  logger.warn " rss | item.guid.content missing !!!! - using link for guid"
@@ -121,9 +102,56 @@ class RssFeedBuilder
121
102
  # <category><![CDATA[Ruby]]></category>
122
103
  # <category><![CDATA[Ruby on Rails]]></category>
123
104
 
124
-
125
105
  item
126
106
  end # method build_feed_item_from_rss
127
107
 
108
+
109
+
110
+ def handle_date( el, name )
111
+ ## change time to utc if present? why? why not?
112
+ # -- .utc.strftime( "%Y-%m-%d %H:%M" )
113
+
114
+ # NOTE:
115
+ # All date-times in RSS conform
116
+ # to the Date and Time Specification of RFC 822
117
+ # e.g. Sun, 19 May 2012 15:21:36 GMT or
118
+ # Sat, 07 Sep 2013 00:00:01 GMT
119
+
120
+ ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
121
+
122
+ logger.debug " rss | #{name} >#{el}< : #{el.class.name}"
123
+
124
+
125
+ ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
126
+ date = if el.nil?
127
+ nil
128
+ else
129
+ el.to_datetime
130
+ end
131
+
132
+ date
133
+ end
134
+
135
+ def handle_content( el, name )
136
+ ## note:
137
+ # use for feed.title, feed.description
138
+ # item.title, item.description
139
+ #
140
+ # do NOT use for others e.g. feed.generator, etc.
141
+
142
+
143
+ ## todo/fix: strip html tags n attributes ???
144
+
145
+ logger.debug " rss | #{name} >#{el}< : #{el.class.name}"
146
+
147
+ text = if el.nil?
148
+ nil
149
+ else
150
+ el.strip
151
+ end
152
+ text
153
+ end
154
+
155
+
128
156
  end # class RssFeedBuilder
129
157
  end # module FeedParser