feedparser 0.2.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 747fe0251bd3069a2c17b9ef047673386410b314
4
- data.tar.gz: 2d5e9a169f84b90e3fd5b66c4b614e5b57adc82e
3
+ metadata.gz: 8986d2c787017a536660de47dc443469b6eb2a0a
4
+ data.tar.gz: 94441f640c433a63de4f94ffcfbd993af0e4139b
5
5
  SHA512:
6
- metadata.gz: 4042d15f3212979e796b7e979acf27f8da6d19fa8105580649d7bd8239f3f73620a0b99ec7254b11f6a77f3e7db10abe7b87126f10551265c6e8574ba1b62efa
7
- data.tar.gz: 1b1545a13e22c2320020d32b3d4a7a817c6d22d692df9d5a72c70cb1cb6a383cc12d59f279f9ef5734034f17f9a8169561db8d263cfa2c56471dfdd964ecb80e
6
+ metadata.gz: 0643b97dc231542d7e0c426191afa625e1114b60c2828abd9615f339509f63d36f6c44fa463877c3932dd9ef6739295eaba24af98d2c942f382398c7e91352d0
7
+ data.tar.gz: fc838c9f2e4875ff8def0c2855b69bd1786d2391ce9298480cf9b6782915fd88c91c22f961c9f59c17b5e16fcb8b3eb14a0929263c4e2b98fe86cd1d7a26a0ab
@@ -11,8 +11,15 @@ lib/feedparser/parser.rb
11
11
  lib/feedparser/version.rb
12
12
  test/feeds/googlegroups.atom
13
13
  test/feeds/googlegroups2.atom
14
- test/feeds/quirksblog.atom.v03
14
+ test/feeds/headius.atom
15
+ test/feeds/lambdatheultimate.rss2
16
+ test/feeds/railstutorial.atom
17
+ test/feeds/rubyflow.rss2
18
+ test/feeds/rubymine.rss2
19
+ test/feeds/rubyonrails.atom
20
+ test/feeds/sitepoint.rss2
15
21
  test/helper.rb
16
22
  test/test_atom.rb
17
- test/test_atom_from_file.rb
23
+ test/test_atom_live.rb
18
24
  test/test_rss.rb
25
+ test/test_rss_live.rb
data/README.md CHANGED
@@ -17,24 +17,49 @@ Feed • Item
17
17
 
18
18
  ### `Feed` Struct
19
19
 
20
+ #### Mappings
21
+
22
+ Note: uses question mark (`?`) for optional elements (otherwise assume required elements)
23
+
24
+ **Title 'n' Summary**
25
+
26
+ Note: The Feed parser will remove all html tags and attributes from the title (RSS 2.0+Atom),
27
+ description (RSS 2.0) and subtitle (Atom) content and will unescape HTML entities e.g. `&amp;` becomes `&` and so on - always
28
+ resulting in plain vanilla text.
29
+
30
+ | Feed Struct | RSS 2.0 | Notes | Atom | Notes |
31
+ | ------------------ | ----------------- | ------------------- | ------------- | ------------------- |
32
+ | `feed.title` | `title` | plain vanilla text | `title` | plain vanilla text |
33
+ | `feed.summary` | `description` | plain vanilla text | `subtitle`? | plain vanilla text |
34
+
35
+
36
+ **Dates**
37
+
38
+ | Feed Struct | RSS 2.0 | Notes | Atom | Notes |
39
+ | ------------------ | ------------------- | ----------------- | ---------- | --------------- |
40
+ | `feed.updated` | `lastBuildDate`? | RFC-822 format | `updated` | ISO 8601 format |
41
+ | `feed.published` | `pubDate`? | RFC-822 format | - | |
42
+
43
+ Note: Check - for RSS 2.0 set feed.updated to pubDate or lastBuildDate if only one present? if both present - map as above.
44
+
45
+
46
+ RFC-822 date format e.g. Wed, 14 Jan 2015 19:48:57 +0100
47
+
48
+ ISO-8601 date format e.g. 2015-01-11T09:30:16Z
49
+
50
+
20
51
  ~~~
21
52
  class Feed
22
53
  attr_accessor :format # e.g. atom|rss 2.0|etc.
23
- attr_accessor :title
24
- attr_accessor :title_type # e.g. text|html|html-escaped (optional) -use - why?? why not??
54
+ attr_accessor :title # note: always plain vanilla text - if present html tags will get stripped and html entities unescaped
25
55
  attr_accessor :url
26
56
 
27
57
  attr_accessor :items
28
58
 
29
- attr_accessor :summary # e.g. description (rss)
30
- attr_accessor :summary_type # e.g. text|html|html-escaped
31
-
32
- attr_accessor :title2 # e.g. subtitle (atom)
33
- attr_accessor :title2_type # e.g. text|html|html-escaped
59
+ attr_accessor :summary # note: is description in RSS 2.0 and subtitle in Atom; always plain vanilla text
34
60
 
35
- attr_accessor :published
36
- attr_accessor :updated
37
- attr_accessor :built
61
+ attr_accessor :updated # note: is lastBuildDate in RSS 2.0
62
+ attr_accessor :published # note: is pubDate in RSS 2.0; not available in Atom
38
63
 
39
64
  attr_accessor :generator
40
65
  attr_accessor :generator_version # e.g. @version (atom)
@@ -45,20 +70,48 @@ end
45
70
 
46
71
  ### `Item` Struct
47
72
 
73
+ **Title 'n' Summary**
74
+
75
+ Note: The Feed parser will remove all html tags and attributes from the title (RSS 2.0+Atom),
76
+ description (RSS 2.0) and summary (Atom) content
77
+ and will unescape HTML entities e.g. `&amp;` becomes `&` and so on - always
78
+ resulting in plain vanilla text.
79
+
80
+ Note: In plain vanilla RSS 2.0 there's no difference between (full) content and summary - everything is wrapped
81
+ in a description element; however, best practice is using the content "module" from RSS 1.0 inside RSS 2.0.
82
+ If there's no content module present the feed parser will "clone" the description and use one version for `item.summary` and
83
+ the clone for `item.content`.
84
+
85
+ Note: The content element will assume html content.
86
+
87
+ | Feed Struct | RSS 2.0 | Notes | Atom | Notes |
88
+ | ------------------ | ----------------- | ------------------- | ------------- | ------------------- |
89
+ | `item.title` | `title` | plain vanilla text | `title` | plain vanilla text |
90
+ | `item.summary` | `description` | plain vanilla text | `summary`? | plain vanilla text |
91
+ | `item.content` | `content`? | html | `content`? | html |
92
+
93
+
94
+ **Dates**
95
+
96
+ | Item Struct | RSS 2.0 | Notes | Atom | Notes |
97
+ | ------------------ | ------------------- | ----------------- | ------------- | --------------- |
98
+ | `item.updated` | `pubDate`? | RFC-822 format | `updated` | ISO 8601 format |
99
+ | `item.published` | - | RFC-822 format | `published`? | ISO 8601 format |
100
+
101
+ Note: In plain vanilla RSS 2.0 there's only one `pubDate` for items, thus, it's not possible to differentiate between published and updated dates for items; note - the `item.pubDate` will get mapped to `item.updated`. To set the published date in RSS 2.0 use the dublin core module e.g. `dc:created`.
102
+
48
103
  ~~~
49
104
  class Item
50
- attr_accessor :title
51
- attr_accessor :title_type # optional for now (text|html|html-escaped) - not yet set
52
- attr_accessor :url # todo: rename to link (use alias) ??
105
+ attr_accessor :title # note: always plain vanilla text - if present html tags will get stripped and html entities unescaped
106
+ attr_accessor :url
53
107
 
54
108
  attr_accessor :content
55
109
  attr_accessor :content_type # optional for now (text|html|html-escaped|binary-base64) - not yet set
56
110
 
57
111
  attr_accessor :summary
58
- attr_accessor :summary_type # optional for now (text|html|html-escaped) - not yet set
59
112
 
60
- attr_accessor :published
61
- attr_accessor :updated
113
+ attr_accessor :updated # note: is pubDate in RSS 2.0 and updated in Atom
114
+ attr_accessor :published # note: is published in Atom; not available in RSS 2.0 (use dc:created ??)
62
115
 
63
116
  attr_accessor :guid # todo: rename to id (use alias) ??
64
117
  end
@@ -78,17 +131,6 @@ pp feed
78
131
  ~~~
79
132
 
80
133
 
81
-
82
- ## Alternatives
83
-
84
- - [`syndication`](http://syndication.rubyforge.org) [(Source)](https://github.com/lpar/syndication) - by Mathew (aka lpar); RSS 1.0, 2.0, Atom, and understands namespaces; optional support for Dublin Core, iTunes/podcast feeds, and RSS 1.0 Syndication and Content modules
85
- - [`simple-rss`](http://rubyforge.org/projects/simple-rss)
86
- - [`feedtools`](http://rubyforge.org/projects/feedtools)
87
-
88
- TBD
89
-
90
-
91
-
92
134
  ## Install
93
135
 
94
136
  Just install the gem:
data/Rakefile CHANGED
@@ -18,7 +18,8 @@ Hoe.spec 'feedparser' do
18
18
  self.history_file = 'HISTORY.md'
19
19
 
20
20
  self.extra_deps = [
21
- ['logutils', '>= 0.6.1']
21
+ ['logutils', '>=0.6.1'],
22
+ ['textutils', '>=1.0.0'],
22
23
  ]
23
24
 
24
25
  ### todo: add fetcher dep for testing (e.g. development only)
@@ -1,12 +1,18 @@
1
+ # encoding: utf-8
2
+
3
+
1
4
  # core and stdlibs
2
5
 
3
6
  require 'rss'
4
7
  require 'pp'
5
- require 'date'
8
+ require 'time' # note: ruby has a builtin core time class and a stdlib time class pack; require stdlib extensions
9
+ require 'date' # note: ruby has a builtin core date class and a stdlib date class pack; require stdlib extensions
6
10
 
7
11
  # 3rd party gems/libs
8
12
 
9
13
  require 'logutils'
14
+ require 'textutils'
15
+
10
16
 
11
17
  # our own code
12
18
 
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
 
2
3
  module FeedParser
3
4
 
@@ -5,6 +6,12 @@ class AtomFeedBuilder
5
6
 
6
7
  include LogUtils::Logging
7
8
 
9
+
10
+ def self.build( atom_feed )
11
+ feed = self.new( atom_feed )
12
+ feed.to_feed
13
+ end
14
+
8
15
  def initialize( atom_feed )
9
16
  @feed = build_feed( atom_feed )
10
17
  end
@@ -13,28 +20,21 @@ class AtomFeedBuilder
13
20
  @feed
14
21
  end
15
22
 
16
- def self.build( atom_feed )
17
- feed = self.new( atom_feed )
18
- feed.to_feed
19
- end
20
23
 
21
24
 
22
25
  def build_feed( atom_feed )
23
26
  feed = Feed.new
24
- ## feed.object = atom_feed # not use for now
25
27
  feed.format = 'atom'
26
28
 
27
- feed.title = atom_feed.title.content
28
- logger.debug " atom | title.content >#{atom_feed.title.content}< : #{atom_feed.title.content.class.name}"
29
-
29
+ feed.title = handle_content( atom_feed.title, 'feed.title' )
30
30
 
31
- logger.debug " atom | id.content >#{atom_feed.id.content}< : #{atom_feed.id.content.class.name}"
31
+ logger.debug " atom | feed.id.content >#{atom_feed.id.content}< : #{atom_feed.id.content.class.name}"
32
32
 
33
33
  feed.url = nil
34
34
 
35
35
  ## note: use links (plural to allow multiple links e.g. self,alternate,etc.)
36
36
  atom_feed.links.each_with_index do |link,i|
37
- logger.debug " atom | link[#{i+1}] link rel=>#{link.rel}< : #{link.rel.class.name} type=#{link.type} href=#{link.href}"
37
+ logger.debug " atom | feed.link[#{i+1}] rel=>#{link.rel}< : #{link.rel.class.name} type=>#{link.type}< href=>#{link.href}<"
38
38
 
39
39
  ## for now assume alternate is link or no rel specified (assumes alternate)
40
40
  ## note: only set if feed.url is NOT already set (via <id> for example)
@@ -43,7 +43,11 @@ class AtomFeedBuilder
43
43
  end
44
44
  end
45
45
 
46
- ## note: as fallback try id if still no url found
46
+ if feed.url.nil?
47
+ ### todo/fix: issue warning - no link found!!!!
48
+ end
49
+
50
+ ## note: as fallback try id if still no url found - why?? why not??
47
51
  ## use url only if starts_with http
48
52
  ## might not be link e.g blogger uses for ids =>
49
53
  ## <id>tag:blogger.com,1999:blog-4704664917418794835</id>
@@ -58,30 +62,23 @@ class AtomFeedBuilder
58
62
 
59
63
 
60
64
  if atom_feed.updated
61
- # NOTE: empty updated.content e.g. used by google groups feed
62
- # will return nil : NilClass
63
-
64
- ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
65
-
66
- feed.updated = atom_feed.updated.content.nil? ? nil : atom_feed.updated.content.to_datetime # .utc.strftime( "%Y-%m-%d %H:%M" )
67
- logger.debug " atom | updated.content >#{atom_feed.updated.content}< : #{atom_feed.updated.content.class.name}"
65
+ feed.updated = handle_date( atom_feed.updated, 'feed.updated' )
68
66
  end
69
67
 
70
68
  if atom_feed.generator
71
69
  ## Note: remove (strip) leading and trailing spaces and newlines
72
70
  feed.generator = atom_feed.generator.content.strip
73
- logger.debug " atom | generator.content >#{atom_feed.generator.content}< : #{atom_feed.generator.content.class.name}"
71
+ logger.debug " atom | feed.generator.content >#{atom_feed.generator.content}< : #{atom_feed.generator.content.class.name}"
74
72
 
75
73
  # pp atom_feed.generator
76
74
  feed.generator_version = atom_feed.generator.version
77
75
  feed.generator_uri = atom_feed.generator.uri
78
- logger.debug " atom | generator.version >#{atom_feed.generator.version}< : #{atom_feed.generator.version.class.name}"
79
- logger.debug " atom | generator.uri >#{atom_feed.generator.uri}< : #{atom_feed.generator.uri.class.name}"
76
+ logger.debug " atom | feed.generator.version >#{atom_feed.generator.version}< : #{atom_feed.generator.version.class.name}"
77
+ logger.debug " atom | feed.generator.uri >#{atom_feed.generator.uri}< : #{atom_feed.generator.uri.class.name}"
80
78
  end
81
79
 
82
80
  if atom_feed.subtitle
83
- feed.title2 = atom_feed.subtitle.content
84
- logger.debug " atom | subtitle.content >#{atom_feed.subtitle.content}< : #{atom_feed.subtitle.content.class.name}"
81
+ feed.summary = handle_content( atom_feed.subtitle, 'feed.subtitle => summary' )
85
82
  end
86
83
 
87
84
 
@@ -94,48 +91,101 @@ class AtomFeedBuilder
94
91
  feed # return new feed
95
92
  end # method build_feed_from_atom
96
93
 
94
+
97
95
  def build_feed_item( atom_item )
98
96
  item = Item.new # Item.new
99
- ## item.object = atom_item # not used for now
100
97
 
101
- item.title = atom_item.title.content
102
- item.url = atom_item.link.href
103
-
104
- logger.debug " atom | item.title.content: >#{atom_item.title.content}< : #{atom_item.title.content.class.name}"
105
- logger.debug " atom | item.link.href: >#{atom_item.link.href}< : #{atom_item.link.href.class.name}"
98
+ item.title = handle_content( atom_item.title, 'item.title' )
99
+
100
+ ## Note: item might have many links
101
+ ## e.g. see blogger (headius)
102
+ ## <link rel='replies' type='application/atom+xml' href='http://blog.headius.com/feeds/3430080308857860963/comments/default' title='Post Comments'/>
103
+ ## <link rel='replies' type='text/html' href='http://blog.headius.com/2014/05/jrubyconfeu-2014.html#comment-form' title='0 Comments'/>
104
+ ## <link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/4704664917418794835/posts/default/3430080308857860963'/>
105
+ ## <link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/4704664917418794835/posts/default/3430080308857860963'/>
106
+ ## <link rel='alternate' type='text/html' href='http://blog.headius.com/2014/05/jrubyconfeu-2014.html'
107
+
108
+ item.url = nil
109
+
110
+ if atom_item.links.size == 1
111
+ item.url = atom_item.link.href
112
+ logger.debug " atom | item.link.href >#{atom_item.link.href}< : #{atom_item.link.href.class.name}"
113
+ else
114
+ ## note: use links (plural to allow multiple links e.g. self,alternate,etc.)
115
+ atom_item.links.each_with_index do |link,i|
116
+ logger.debug " atom | item.link[#{i+1}] rel=>#{link.rel}< : #{link.rel.class.name} type=>#{link.type}< href=>#{link.href}<"
117
+ ## for now assume alternate is link or no rel specified (assumes alternate)
118
+ ## note: only set if feed.url is NOT already set (via <id> for example)
119
+ if item.url.nil? && (link.rel == 'alternate' || link.rel.nil?)
120
+ item.url = link.href
121
+ end
122
+ end
123
+ end
106
124
 
107
125
 
108
126
  if atom_item.updated
109
- ## change time to utc if present? why? why not?
110
- # -- .utc.strftime( "%Y-%m-%d %H:%M" )
111
-
112
- ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
113
-
114
- item.updated = atom_item.updated.content.nil? ? nil : atom_item.updated.content.to_datetime
115
- logger.debug " atom | item.updated.content >#{atom_item.updated.content}< : #{atom_item.updated.content.class.name}"
127
+ item.updated = handle_date( atom_item.updated, 'item.updated' )
116
128
  end
117
129
 
118
130
  if atom_item.published
119
- ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
120
-
121
- item.published = atom_item.published.content.nil? ? nil : atom_item.published.content.to_datetime
122
- logger.debug " atom | item.published.content >#{atom_item.published.content}< : #{atom_item.published.content.class.name}"
131
+ item.published = handle_date( atom_item.published, 'item.published' )
123
132
  end
124
133
 
125
134
 
126
135
  item.guid = atom_item.id.content
127
- logger.debug " atom | item.id.content: >#{atom_item.id.content}< : #{atom_item.id.content.class.name}"
136
+ logger.debug " atom | item.id.content >#{atom_item.id.content}< : #{atom_item.id.content.class.name}"
128
137
 
129
138
  if atom_item.content
130
139
  item.content = atom_item.content.content
131
140
  end
132
141
 
133
142
  if atom_item.summary
134
- item.summary = atom_item.summary.content
143
+ item.summary = handle_content( atom_item.summary, 'item.summary' )
135
144
  end
136
145
 
137
146
  item
138
147
  end # method build_feed_item
139
148
 
149
+
150
+
151
+ def handle_date( el, name )
152
+ ## change time to utc if present? why? why not?
153
+ # -- .utc.strftime( "%Y-%m-%d %H:%M" )
154
+
155
+ ###############
156
+ # examples:
157
+ # 2015-01-02 01:56:06 +0100
158
+
159
+ logger.debug " atom | #{name}.content >#{el.content}< : #{el.content.class.name}"
160
+
161
+ # NOTE: empty updated.content possible e.g. used by google groups feed (e.g. <updated></updated>)
162
+ # will return nil : NilClass
163
+
164
+ ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
165
+ date = if el.content.nil?
166
+ nil
167
+ else
168
+ el.content.to_datetime
169
+ end
170
+
171
+ date
172
+ end
173
+
174
+
175
+ def handle_content( el, name ) ## rename to handle_plain_vanilla_text_content - why? why not?
176
+ ### todo/fix: if type html ?? strip html tags n attributes
177
+ ## always strip html tags n attributes?? why? why not?
178
+
179
+ ## check if content.nil? possible e.g. <title></title> => empty string or nil?
180
+
181
+ ## note: dump head (first 30 chars)
182
+ logger.debug " atom | #{name}.content[0..30] (type=>#{el.type}<) >#{el.content[0..30]}< : #{el.content.class.name}"
183
+
184
+ ## note: always strip leading and trailing whitespaces (spaces/tabs/newlines)
185
+ text = el.content.strip
186
+ text
187
+ end
188
+
189
+
140
190
  end # AtomFeedBuilder
141
191
  end # FeedParser
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
 
2
3
  module FeedParser
3
4
 
@@ -8,6 +9,12 @@ class RssFeedBuilder
8
9
 
9
10
  include LogUtils::Logging
10
11
 
12
+
13
+ def self.build( rss_feed )
14
+ feed = self.new( rss_feed )
15
+ feed.to_feed
16
+ end
17
+
11
18
  def initialize( rss_feed )
12
19
  @feed = build_feed( rss_feed )
13
20
  end
@@ -16,41 +23,25 @@ class RssFeedBuilder
16
23
  @feed
17
24
  end
18
25
 
19
- def self.build( rss_feed )
20
- feed = self.new( rss_feed )
21
- feed.to_feed
22
- end
23
26
 
24
27
 
25
28
  def build_feed( rss_feed )
26
29
  feed = Feed.new
27
- ## feed.object = rss_feed # not use for now
28
30
  feed.format = "rss #{rss_feed.rss_version}"
29
31
 
30
- feed.title = rss_feed.channel.title # required
31
- feed.url = rss_feed.channel.link # required
32
- feed.summary = rss_feed.channel.description # required
33
-
34
- logger.debug " rss | channel.description: >#{rss_feed.channel.description}< : #{rss_feed.channel.description.class.name}"
35
-
36
- # NOTE:
37
- # All date-times in RSS conform
38
- # to the Date and Time Specification of RFC 822
39
- # e.g. Sun, 19 May 2012 15:21:36 GMT or
40
- # Sat, 07 Sep 2013 00:00:01 GMT
32
+ logger.debug " rss | feed.version >#{rss_feed.rss_version}<"
41
33
 
42
- ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
43
-
44
- feed.built = rss_feed.channel.lastBuildDate.nil? ? nil : rss_feed.channel.lastBuildDate.to_datetime # optional
45
- feed.published = rss_feed.channel.pubDate.nil? ? nil : rss_feed.channel.pubDate.to_datetime # optional
34
+ feed.title = handle_content( rss_feed.channel.title, 'feed.title' ) # required
35
+ feed.summary = handle_content( rss_feed.channel.description, 'feed.description => summary' ) # required
36
+ feed.url = rss_feed.channel.link # required
46
37
 
47
- logger.debug " rss | channel.lastBuildDate: >#{rss_feed.channel.lastBuildDate}< : #{rss_feed.channel.lastBuildDate.class.name}"
48
- logger.debug " rss | channel.pubDate: >#{rss_feed.channel.pubDate}< : #{rss_feed.channel.pubDate.class.name}"
38
+ feed.updated = handle_date( rss_feed.channel.lastBuildDate, 'feed.lastBuildDate => updated' ) # optional
39
+ feed.published = handle_date( rss_feed.channel.pubDate, 'feed.pubDate => published' ) # optional
49
40
 
50
41
 
51
- feed.generator = rss_feed.channel.generator # optional
42
+ feed.generator = rss_feed.channel.generator # optional
52
43
 
53
- logger.debug " rss | channel.generator: >#{rss_feed.channel.generator}< : #{rss_feed.channel.generator.class.name}"
44
+ logger.debug " rss | feed.generator >#{rss_feed.channel.generator}< : #{rss_feed.channel.generator.class.name}"
54
45
 
55
46
 
56
47
  items = []
@@ -65,13 +56,12 @@ class RssFeedBuilder
65
56
  def build_feed_item( rss_item )
66
57
 
67
58
  item = Item.new
68
- ## item.object = rss_item # not use for now
69
59
 
70
- item.title = rss_item.title
60
+ item.title = handle_content( rss_item.title, 'item.title' )
71
61
  item.url = rss_item.link
72
62
 
73
- logger.debug " rss | item.title: >#{rss_item.title}< : #{rss_item.title.class.name}"
74
- logger.debug " rss | item.link: >#{rss_item.link}< : #{rss_item.link.class.name}"
63
+ logger.debug " rss | item.link >#{rss_item.link}< : #{rss_item.link.class.name}"
64
+
75
65
 
76
66
  ## todo:
77
67
  ## check if feedburner:origLink present - if yes, use it for url/link
@@ -81,24 +71,15 @@ class RssFeedBuilder
81
71
  ## - <link>http://feedproxy.google.com/~r/Rubyflow/~3/Ym9Sltg_2_c/9803-gotta-ruby-s-syntax</link>
82
72
 
83
73
 
84
- item.summary = rss_item.description
74
+ item.summary = handle_content( rss_item.description, 'item.description => summary' )
85
75
 
86
76
  # check for <content:encoded>
87
77
  # -- using RSS 1.0 content module in RSS 2.0
88
78
  item.content = rss_item.content_encoded
89
- logger.debug " rss | item.content_encoded[0..40]: >#{rss_item.content_encoded ? rss_item.content_encoded[0..40] : ''}< : #{rss_item.content_encoded.class.name}"
90
-
91
- # NOTE:
92
- # All date-times in RSS conform
93
- # to the Date and Time Specification of RFC 822
94
- # e.g. Sun, 19 May 2012 15:21:36 GMT or
95
- # Sat, 07 Sep 2013 00:00:01 GMT
96
-
97
- ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
79
+ logger.debug " rss | item.content_encoded[0..40] >#{rss_item.content_encoded ? rss_item.content_encoded[0..40] : ''}< : #{rss_item.content_encoded.class.name}"
98
80
 
99
- item.published = rss_item.pubDate.nil? ? nil : rss_item.pubDate.to_datetime # .utc.strftime( "%Y-%m-%d %H:%M" )
100
81
 
101
- logger.debug " rss | item.pubDate: >#{rss_item.pubDate}< : #{rss_item.pubDate.class.name}"
82
+ item.updated = handle_date( rss_item.pubDate, 'item.pubDate => updated' )
102
83
 
103
84
 
104
85
  ## fix/todo: check if rss_item.guid present? !!!!
@@ -107,7 +88,7 @@ class RssFeedBuilder
107
88
 
108
89
  if rss_item.guid && rss_item.guid.content
109
90
  item.guid = rss_item.guid.content
110
- logger.debug " rss | item.guid.content: >#{rss_item.guid.content}< : #{rss_item.guid.content.class.name}"
91
+ logger.debug " rss | item.guid.content >#{rss_item.guid.content}< : #{rss_item.guid.content.class.name}"
111
92
  else
112
93
  item.guid = rss_item.link
113
94
  logger.warn " rss | item.guid.content missing !!!! - using link for guid"
@@ -121,9 +102,56 @@ class RssFeedBuilder
121
102
  # <category><![CDATA[Ruby]]></category>
122
103
  # <category><![CDATA[Ruby on Rails]]></category>
123
104
 
124
-
125
105
  item
126
106
  end # method build_feed_item_from_rss
127
107
 
108
+
109
+
110
+ def handle_date( el, name )
111
+ ## change time to utc if present? why? why not?
112
+ # -- .utc.strftime( "%Y-%m-%d %H:%M" )
113
+
114
+ # NOTE:
115
+ # All date-times in RSS conform
116
+ # to the Date and Time Specification of RFC 822
117
+ # e.g. Sun, 19 May 2012 15:21:36 GMT or
118
+ # Sat, 07 Sep 2013 00:00:01 GMT
119
+
120
+ ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
121
+
122
+ logger.debug " rss | #{name} >#{el}< : #{el.class.name}"
123
+
124
+
125
+ ## convert from time to to_datetime (avoid errors on windows w/ builtin rss lib)
126
+ date = if el.nil?
127
+ nil
128
+ else
129
+ el.to_datetime
130
+ end
131
+
132
+ date
133
+ end
134
+
135
+ def handle_content( el, name )
136
+ ## note:
137
+ # use for feed.title, feed.description
138
+ # item.title, item.description
139
+ #
140
+ # do NOT use for others e.g. feed.generator, etc.
141
+
142
+
143
+ ## todo/fix: strip html tags n attributes ???
144
+
145
+ logger.debug " rss | #{name} >#{el}< : #{el.class.name}"
146
+
147
+ text = if el.nil?
148
+ nil
149
+ else
150
+ el.strip
151
+ end
152
+ text
153
+ end
154
+
155
+
128
156
  end # class RssFeedBuilder
129
157
  end # module FeedParser