feedparser 2.0.0 → 2.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 12a89b6c4d0ad5290a16d44a79245a25bb6ff349
4
- data.tar.gz: 63057209038f7cab23eb4fd963d9525b154d2fbb
3
+ metadata.gz: 417617dcf5fa45dfb199a7e692d2602b8dcb9e3f
4
+ data.tar.gz: 0fe1defd0d6bd634e228b067b781b21dcbb7bb3b
5
5
  SHA512:
6
- metadata.gz: 6220dd036c705fedb52a17495001003c88891a912c3ad9e3fed07045adde3f0902d8dcbf3fc7000fbc297b91bba388979c54fbdaf1890f20e541d9216bfc5438
7
- data.tar.gz: 049a3b9cdf4b27fe1fbc2e9109241ed951e4a717e6f620bd6aa7b2b17c9657e1fe468b86fd52cd1bf5ff48192935b3939d6579ca612c5ac576553c2f9bfcb433
6
+ metadata.gz: f0894102ebd9b750c12476782f2305b0d8a40946d2a209bc3e19a865ae32b9b17c210449b86f877d67922ba4ae860ba0e454d4de78b3c50b665e0933490a7b3a
7
+ data.tar.gz: 4b6f400c017e704d85a2ce7ec512532752a905bf15b541d452c5d32a07ec0dcb46e6b80470e5708081b0dae0d203e0a3074555294e826534fd13bef00f887f6f
File without changes
@@ -1,8 +1,9 @@
1
- HISTORY.md
1
+ CHANGELOG.md
2
2
  Manifest.txt
3
3
  README.md
4
4
  Rakefile
5
5
  lib/feedparser.rb
6
+ lib/feedparser/attachment.rb
6
7
  lib/feedparser/author.rb
7
8
  lib/feedparser/builder/atom.rb
8
9
  lib/feedparser/builder/json.rb
@@ -13,9 +14,12 @@ lib/feedparser/generator.rb
13
14
  lib/feedparser/item.rb
14
15
  lib/feedparser/parser.rb
15
16
  lib/feedparser/tag.rb
17
+ lib/feedparser/thumbnail.rb
16
18
  lib/feedparser/version.rb
17
19
  test/helper.rb
20
+ test/media_rss_example.txt
18
21
  test/test_atom_live.rb
22
+ test/test_attachments_live.rb
19
23
  test/test_dates.rb
20
24
  test/test_microformats.rb
21
25
  test/test_rss_live.rb
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # feedparser
2
2
 
3
- feedparser gem - web feed parser and normalizer (Atom, RSS 2.0, JSON Feed, HTML h-entry, etc.)
3
+ feedparser gem - web feed parser and normalizer (Atom, RSS, JSON Feed, HTML h-entry, etc.)
4
4
 
5
5
  * home :: [github.com/feedparser/feedparser](https://github.com/feedparser/feedparser)
6
6
  * bugs :: [github.com/feedparser/feedparser/issues](https://github.com/feedparser/feedparser/issues)
@@ -11,6 +11,8 @@ feedparser gem - web feed parser and normalizer (Atom, RSS 2.0, JSON Feed, HTML
11
11
 
12
12
  ## What's News?
13
13
 
14
+ **October/2017**: Added support for attachments / media enclosures in RSS and Atom.
15
+
14
16
  **June/2017**: Added support for reading feeds in HTML with Microformats incl.
15
17
  [`h-entry`](http://microformats.org/wiki/h-entry),
16
18
  [`h-feed`](http://microformats.org/wiki/h-feed) and others.
@@ -32,7 +34,7 @@ See the [Awesome Feeds](https://github.com/feedparser/awesome-feeds) page ».
32
34
 
33
35
  ### Structs
34
36
 
35
- Feed • Item • Author • Tag • Generator
37
+ Feed • Item • Author • Tag • Attachment • Generator
36
38
 
37
39
 
38
40
  ![](feed-models.png)
@@ -158,12 +160,20 @@ end
158
160
 
159
161
  (Source: [`lib/feedparser/tag.rb`](https://github.com/feedparser/feedparser/blob/master/lib/feedparser/tag.rb))
160
162
 
163
+ ### `Attachment` Struct
164
+
165
+ _Also known as Media Enclosure_
166
+
167
+ (Source: [`lib/feedparser/attachment.rb`](https://github.com/feedparser/feedparser/blob/master/lib/feedparser/attachment.rb))
168
+
169
+
161
170
  ### `Generator` Struct
162
171
 
163
172
  (Source: [`lib/feedparser/generator.rb`](https://github.com/feedparser/feedparser/blob/master/lib/feedparser/generator.rb))
164
173
 
165
174
 
166
175
 
176
+
167
177
  ### Read Feed Example
168
178
 
169
179
  ``` ruby
data/Rakefile CHANGED
@@ -5,7 +5,7 @@ Hoe.spec 'feedparser' do
5
5
 
6
6
  self.version = FeedParser::VERSION
7
7
 
8
- self.summary = 'feedparser - web feed parser and normalizer (RSS 2.0, Atom, JSON Feed, HTML h-entry, etc.)'
8
+ self.summary = 'feedparser - web feed parser and normalizer (RSS, Atom, JSON Feed, HTML h-entry, etc.)'
9
9
  self.description = summary
10
10
 
11
11
  self.urls = ['https://github.com/feedparser/feedparser']
@@ -15,11 +15,12 @@ Hoe.spec 'feedparser' do
15
15
 
16
16
  # switch extension to .markdown for github formatting
17
17
  self.readme_file = 'README.md'
18
- self.history_file = 'HISTORY.md'
18
+ self.history_file = 'CHANGELOG.md'
19
19
 
20
20
  self.extra_deps = [
21
21
  ['logutils', '>=0.6.1'],
22
22
  ['textutils', '>=1.0.0'],
23
+ ## ['oga', '>=3.2.0'], note: oga is a "soft" dependency
23
24
  ]
24
25
 
25
26
  ### todo: add fetcher dep for testing (e.g. development only)
@@ -27,7 +28,7 @@ Hoe.spec 'feedparser' do
27
28
  self.licenses = ['Public Domain']
28
29
 
29
30
  self.spec_extras = {
30
- required_ruby_version: '>= 1.9.2'
31
+ required_ruby_version: '>= 2.2.2'
31
32
  }
32
33
 
33
34
  end
@@ -30,6 +30,8 @@ require 'feedparser/feed'
30
30
  require 'feedparser/item'
31
31
  require 'feedparser/author'
32
32
  require 'feedparser/tag'
33
+ require 'feedparser/attachment'
34
+ require 'feedparser/thumbnail'
33
35
  require 'feedparser/generator'
34
36
  require 'feedparser/parser'
35
37
 
@@ -0,0 +1,23 @@
1
+ # encoding: utf-8
2
+
3
+ module FeedParser
4
+
5
+ class Attachment ## also known as Enclosure
6
+
7
+ attr_accessor :url
8
+ ## note: uri is an alias for url
9
+ alias :uri :url ## add atom alias for uri - why? why not?
10
+ alias :uri= :url=
11
+
12
+ attr_accessor :length
13
+ attr_accessor :type
14
+
15
+ # Elements from the media namespace attachment
16
+ attr_accessor :title
17
+ attr_accessor :thumbnail
18
+ attr_accessor :description
19
+ attr_accessor :community
20
+
21
+ end # class Attachment
22
+
23
+ end # module FeedParser
@@ -22,7 +22,7 @@ class Author
22
22
  ## or managingEditor (rss) or webMaster (rss) - why? why not??
23
23
 
24
24
  attr_accessor :text # note: holds "unparsed" text (content) line from dc:creator or rss:author
25
-
25
+ alias :line :text # line|text (add str?? too)
26
26
 
27
27
  def to_s
28
28
  ## note: to_s - allows to use just author in templates
@@ -7,13 +7,13 @@ class AtomFeedBuilder
7
7
  include LogUtils::Logging
8
8
 
9
9
 
10
- def self.build( atom_feed )
11
- feed = self.new( atom_feed )
10
+ def self.build( atom_feed, raw )
11
+ feed = self.new( atom_feed, raw )
12
12
  feed.to_feed
13
13
  end
14
14
 
15
- def initialize( atom_feed )
16
- @feed = build_feed( atom_feed )
15
+ def initialize( atom_feed, raw )
16
+ @feed = build_feed( atom_feed, raw )
17
17
  end
18
18
 
19
19
  def to_feed
@@ -22,7 +22,7 @@ class AtomFeedBuilder
22
22
 
23
23
 
24
24
 
25
- def build_feed( atom_feed ) ## fix/todo: rename atom_feed to atom or wire or xml or in ???
25
+ def build_feed( atom_feed, raw ) ## fix/todo: rename atom_feed to atom or wire or xml or in ???
26
26
  feed = Feed.new
27
27
  feed.format = 'atom'
28
28
 
@@ -110,6 +110,16 @@ class AtomFeedBuilder
110
110
  feed.items << build_item( atom_item )
111
111
  end
112
112
 
113
+
114
+ if defined?( Oga )
115
+ # Use Oga as generic xml parser to access elements not addressed by the core RSS module like media:
116
+ parsed_xml = Oga.parse_xml( raw )
117
+ xml_items = parsed_xml.xpath( '/feed/entry' )
118
+ xml_items.each_with_index do |xml_item, i|
119
+ feed.items[i] = add_meta_items( feed.items[i], xml_item )
120
+ end
121
+ end
122
+
113
123
  feed # return new feed
114
124
  end # method build_feed_from_atom
115
125
 
@@ -176,7 +186,7 @@ class AtomFeedBuilder
176
186
  item.updated = item.updated_local.utc
177
187
  end
178
188
 
179
- if atom_item.published && atom_item.published.content
189
+ if atom_item.published && atom_item.published.content
180
190
  item.published_local = handle_date( atom_item.published, 'item.published' )
181
191
  item.published = item.published_local.utc
182
192
  end
@@ -203,10 +213,54 @@ class AtomFeedBuilder
203
213
  item.tags << build_tag( atom_cat )
204
214
  end
205
215
 
216
+
217
+ ## check for attachments / media enclosures
218
+ ### todo/fix: allow more than one attachment/enclosure
219
+ if atom_item.links
220
+ enclosure = atom_item.links.detect{ |x| x.rel == 'enclosure' }
221
+ if enclosure
222
+ attachment = Attachment.new
223
+ attachment.url = enclosure.href
224
+ attachment.length = enclosure.length
225
+ attachment.type = enclosure.type
226
+ item.attachments << attachment
227
+ end
228
+ end
229
+
206
230
  item
207
231
  end # method build_item
208
232
 
209
233
 
234
+ # Add additional elements, currently the media: namespace elements
235
+ # Note: This tries to accommodate both the different ways to transport the data via the spec https://www.rssboard.org/media-rss/ and the practice by Youtube of grouping everything under media:group
236
+ def add_meta_items( feed_item, xml_item )
237
+ if xml_item.at_xpath('media:group') || xml_item.at_xpath('media:title') || xml_item.at_xpath('media:content') || xml_item.at_xpath('media:thumbnail') || xml_item.at_xpath('media:description')
238
+ feed_item.attachments << Attachment.new unless feed_item.attachments.first
239
+
240
+ titleElement = xml_item.at_xpath('media:title') || xml_item.at_xpath('media:content/media:title') || xml_item.at_xpath('media:group/media:title')
241
+ feed_item.attachments.first.title = titleElement.text if titleElement
242
+
243
+ contentElement = xml_item.at_xpath('media:content') || xml_item.at_xpath('media:group/media:content')
244
+ if contentElement
245
+ feed_item.attachments.first.url = contentElement.get('url')
246
+ feed_item.attachments.first.length = contentElement.get('duration')
247
+ end
248
+
249
+ thumbnailElement = xml_item.at_xpath('media:thumbnail') || xml_item.at_xpath('media:content/media:thumbnail') || xml_item.at_xpath('media:group/media:thumbnail')
250
+ if thumbnailElement
251
+ thumbnail = Thumbnail.new
252
+ thumbnail.url = thumbnailElement.get('url')
253
+ thumbnail.width = thumbnailElement.get('width')
254
+ thumbnail.height = thumbnailElement.get('height')
255
+ feed_item.attachments.first.thumbnail = thumbnail
256
+ end
257
+
258
+ descriptionElement = xml_item.at_xpath('media:description') || xml_item.at_xpath('media:content/media:description') || xml_item.at_xpath('media:group/media:description')
259
+ feed_item.attachments.first.description = descriptionElement.text if descriptionElement
260
+ end
261
+ feed_item
262
+ end # method add_meta_items
263
+
210
264
 
211
265
  def handle_date( el, name )
212
266
  ## change time to utc if present? why? why not?
@@ -10,13 +10,13 @@ class RssFeedBuilder
10
10
  include LogUtils::Logging
11
11
 
12
12
 
13
- def self.build( rss_feed )
14
- feed = self.new( rss_feed )
13
+ def self.build( rss_feed, raw )
14
+ feed = self.new( rss_feed, raw )
15
15
  feed.to_feed
16
16
  end
17
17
 
18
- def initialize( rss_feed )
19
- @feed = build_feed( rss_feed )
18
+ def initialize( rss_feed, raw )
19
+ @feed = build_feed( rss_feed, raw )
20
20
  end
21
21
 
22
22
  def to_feed
@@ -25,7 +25,7 @@ class RssFeedBuilder
25
25
 
26
26
 
27
27
 
28
- def build_feed( rss_feed )
28
+ def build_feed( rss_feed, raw )
29
29
  feed = Feed.new
30
30
  feed.format = "rss #{rss_feed.rss_version}"
31
31
 
@@ -35,29 +35,42 @@ class RssFeedBuilder
35
35
  feed.summary = handle_content( rss_feed.channel.description, 'feed.description => summary' ) # required
36
36
  feed.url = rss_feed.channel.link # required
37
37
 
38
- feed.updated_local = handle_date( rss_feed.channel.lastBuildDate, 'feed.lastBuildDate => updated' ) # optional
38
+ begin
39
+ feed.updated_local = handle_date( rss_feed.channel.lastBuildDate, 'feed.lastBuildDate => updated' ) # optional
40
+ rescue
41
+ end
39
42
  feed.updated = feed.updated_local.utc if feed.updated_local
40
43
 
41
- feed.published_local = handle_date( rss_feed.channel.pubDate, 'feed.pubDate => published' ) # optional
44
+ begin
45
+ feed.published_local = handle_date( rss_feed.channel.pubDate, 'feed.pubDate => published' ) # optional
46
+ rescue
47
+ end
42
48
  feed.published = feed.published_local.utc if feed.published_local
43
49
 
44
- logger.debug " rss | feed.generator >#{rss_feed.channel.generator}< : #{rss_feed.channel.generator.class.name}"
50
+ begin
51
+ logger.debug " rss | feed.generator >#{rss_feed.channel.generator}< : #{rss_feed.channel.generator.class.name}"
52
+ rescue
53
+ end
45
54
 
46
- feed.generator.text = rss_feed.channel.generator # optional
55
+ begin
56
+ feed.generator.text = rss_feed.channel.generator # optional
57
+ rescue
58
+ end
47
59
  feed.generator.name = feed.generator.text ## note: for now set also name/title to "unparsed" (content) line (may change in the future!!!)
48
60
 
49
61
 
50
62
 
51
63
  ## check for managingEditor and/or webMaster
52
64
 
53
- if rss_feed.channel.managingEditor
65
+ if rss_feed.channel.respond_to?(:managingEditor) && rss_feed.channel.managingEditor
54
66
  author = Author.new
55
67
  author.text = rss_feed.channel.managingEditor.strip
56
68
  author.name = author.text ## note: for now use "unparsed" (content) line also for name
57
69
  feed.authors << author
58
70
  end
59
71
 
60
- if rss_feed.channel.webMaster
72
+ ## todo/check - if tag is called webmaster or webMaster ???
73
+ if rss_feed.channel.respond_to?(:webMaster) && rss_feed.channel.webMaster
61
74
  author = Author.new
62
75
  author.text = rss_feed.channel.webMaster.strip
63
76
  author.name = author.text ## note: for now use "unparsed" (content) line also for name
@@ -76,9 +89,10 @@ class RssFeedBuilder
76
89
 
77
90
 
78
91
  ### check for categories (tags)
79
-
80
- rss_feed.channel.categories.each do |rss_cat|
81
- feed.tags << build_tag( rss_cat )
92
+ if rss_feed.channel.respond_to?(:categories)
93
+ rss_feed.channel.categories.each do |rss_cat|
94
+ feed.tags << build_tag( rss_cat )
95
+ end
82
96
  end
83
97
 
84
98
 
@@ -86,6 +100,14 @@ class RssFeedBuilder
86
100
  feed.items << build_item( rss_item )
87
101
  end
88
102
 
103
+ if defined?( Oga )
104
+ parsed_xml = Oga.parse_xml( raw )
105
+ xml_items = parsed_xml.xpath( '/rss/channel/item' )
106
+ xml_items.each_with_index do |xml_item, i|
107
+ feed.items[i] = add_meta_items( feed.items[i], xml_item )
108
+ end
109
+ end
110
+
89
111
  feed # return new feed
90
112
  end
91
113
 
@@ -138,16 +160,18 @@ class RssFeedBuilder
138
160
  item.content = rss_item.content_encoded
139
161
  logger.debug " rss | item.content_encoded[0..40] >#{rss_item.content_encoded ? rss_item.content_encoded[0..40] : ''}< : #{rss_item.content_encoded.class.name}"
140
162
 
141
-
142
- item.updated_local = handle_date( rss_item.pubDate, 'item.pubDate => updated' )
143
- item.updated = item.updated_local.utc if item.updated_local
163
+ begin
164
+ item.published_local = handle_date( rss_item.pubDate, 'item.pubDate => published' )
165
+ rescue
166
+ end
167
+ item.published = item.published_local.utc if item.published_local
144
168
 
145
169
 
146
170
  ## fix/todo: check if rss_item.guid present? !!!!
147
171
  ##
148
172
  ## might be the case e.g. check lambda-the-ultimate.org, for example
149
173
 
150
- if rss_item.guid && rss_item.guid.content
174
+ if rss_item.respond_to?(:guid) && rss_item.guid && rss_item.guid.content
151
175
  item.guid = rss_item.guid.content
152
176
  logger.debug " rss | item.guid.content >#{rss_item.guid.content}< : #{rss_item.guid.content.class.name}"
153
177
  else
@@ -156,7 +180,7 @@ class RssFeedBuilder
156
180
  end
157
181
 
158
182
 
159
- if rss_item.author
183
+ if rss_item.respond_to?(:author) && rss_item.author
160
184
  author = Author.new
161
185
  author.text = rss_item.author.strip
162
186
  author.name = author.text ## note: for now use "unparsed" (content) line also for name
@@ -173,18 +197,68 @@ class RssFeedBuilder
173
197
  item.authors = authors
174
198
  end
175
199
 
200
+ unless item.published_local
201
+ # use dc_date only of no regular item date was given
202
+ begin
203
+ item.published_local = handle_date( rss_item.dc_date, 'item.dc_date => published' )
204
+ rescue
205
+ end
206
+ item.published = item.published_local.utc if item.published_local
207
+ end
176
208
 
177
209
  ### check for categories (tags)
178
-
179
- rss_item.categories.each do |rss_cat|
180
- item.tags << build_tag( rss_cat )
210
+ if rss_item.respond_to?(:categories)
211
+ rss_item.categories.each do |rss_cat|
212
+ item.tags << build_tag( rss_cat )
213
+ end
181
214
  end
182
215
 
183
216
 
217
+ ## check for enclosure
218
+ ## todo/check: rss can only include at most one enclosure?
219
+
220
+ if rss_item.respond_to?(:enclosure) && rss_item.enclosure
221
+ attachment = Attachment.new
222
+ attachment.url = rss_item.enclosure.url
223
+ attachment.length = rss_item.enclosure.length
224
+ attachment.type = rss_item.enclosure.type
225
+ item.attachments << attachment
226
+ end
227
+
184
228
  item
185
229
  end # method build_feed_item_from_rss
186
230
 
187
231
 
232
+ # Add additional elements, currently the media: namespace elements
233
+ def add_meta_items( feed_item, xml_item )
234
+ if xml_item.at_xpath('media:group') || xml_item.at_xpath('media:title') || xml_item.at_xpath('media:content') || xml_item.at_xpath('media:thumbnail') || xml_item.at_xpath('media:description')
235
+ feed_item.attachments << Attachment.new unless feed_item.attachments.first
236
+
237
+ titleElement = xml_item.at_xpath('media:title') || xml_item.at_xpath('media:content/media:title') || xml_item.at_xpath('media:group/media:title')
238
+ feed_item.attachments.first.title = titleElement.text if titleElement
239
+
240
+ contentElement = xml_item.at_xpath('media:content') || xml_item.at_xpath('media:group/media:content')
241
+ if contentElement
242
+ feed_item.attachments.first.url = contentElement.get('url')
243
+ feed_item.attachments.first.length = contentElement.get('duration')
244
+ end
245
+
246
+ thumbnailElement = xml_item.at_xpath('media:thumbnail') || xml_item.at_xpath('media:content/media:thumbnail') || xml_item.at_xpath('media:group/media:thumbnail')
247
+ if thumbnailElement
248
+ thumbnail = Thumbnail.new
249
+ thumbnail.url = thumbnailElement.get('url')
250
+ thumbnail.width = thumbnailElement.get('width')
251
+ thumbnail.height = thumbnailElement.get('height')
252
+ feed_item.attachments.first.thumbnail = thumbnail
253
+ end
254
+
255
+ descriptionElement = xml_item.at_xpath('media:description') || xml_item.at_xpath('media:content/media:description') || xml_item.at_xpath('media:group/media:description')
256
+ feed_item.attachments.first.description = descriptionElement.text if descriptionElement
257
+ end
258
+ feed_item
259
+ end # method add_meta_items
260
+
261
+
188
262
 
189
263
  def handle_date( el, name )
190
264
  ## change time to utc if present? why? why not?
@@ -25,10 +25,19 @@ class Feed
25
25
  attr_accessor :tags
26
26
  def tags?() @tags && @tags.size > 0; end
27
27
 
28
+ ## add alias category for tags (remove - why? why not?)
29
+ alias :categories :tags
30
+
28
31
 
29
32
  def summary?() @summary.nil? == false; end
30
33
  attr_accessor :summary # e.g. description (rss)|subtitle (atom)
31
34
 
35
+ ## add description as alias for summary (remove - why? why not?)
36
+ alias :description :summary
37
+ alias :description= :summary=
38
+ alias :description? :summary?
39
+
40
+
32
41
  ##
33
42
  ## todo/check/fix:
34
43
  ## use a extra field for atom subtitle
@@ -47,10 +56,16 @@ class Feed
47
56
  attr_accessor :updated # e.g. lastBuildDate (rss)|updated (atom) -- always (converted) to utc
48
57
  attr_accessor :updated_local # "unparsed" local datetime as in feed (NOT converted to utc)
49
58
 
59
+ attr_accessor :updated_text # string version of date
60
+ alias :updated_line :updated_text # text|line - convention for "unparsed" 1:1 from feed; add str(too ??)
61
+
50
62
  def published?() @published.nil? == false; end
51
63
  attr_accessor :published # e.g. pubDate (rss)\n/a (atom) -- note: published is basically an alias for created
52
64
  attr_accessor :published_local # "unparsed" local datetime as in feed (NOT converted to utc)
53
65
 
66
+ attr_accessor :published_text # string version of date
67
+ alias :published_line :published_text # text|line - convention for "unparsed" 1:1 from feed; add str(too ??)
68
+
54
69
 
55
70
  attr_accessor :generator
56
71
 
@@ -18,6 +18,8 @@ class Generator
18
18
 
19
19
 
20
20
  attr_accessor :text # note: holds "unparsed" text (content) line from rss:generator
21
+ alias :line :text # line|text (add str?? too)
22
+
21
23
 
22
24
  def to_s
23
25
  ## note: to_s - allows to use just generator in templates
@@ -35,15 +35,27 @@ class Item
35
35
  def summary?() @summary.nil? == false; end
36
36
  attr_accessor :summary
37
37
 
38
+ ## add description as alias for summary (remove - why? why not?)
39
+ alias :description :summary
40
+ alias :description= :summary=
41
+ alias :description? :summary?
42
+
43
+
38
44
 
39
45
  def updated?() @updated.nil? == false; end
40
46
  attr_accessor :updated # pubDate (RSS)|updated (Atom)
41
47
  attr_accessor :updated_local # "unparsed" local datetime as in feed (NOT converted to utc)
42
48
 
49
+ attr_accessor :updated_text # string version of date
50
+ alias :updated_line :updated_text # text|line - convention for "unparsed" 1:1 from feed; add str(too ??)
51
+
52
+
43
53
  def published?() @published.nil? == false; end
44
54
  attr_accessor :published # note: published is basically an alias for created
45
55
  attr_accessor :published_local # "unparsed" local datetime as in feed (NOT converted to utc)
46
56
 
57
+ attr_accessor :published_text # string version of date
58
+ alias :published_line :published_text # text|line - convention for "unparsed" 1:1 from feed; add str(too ??)
47
59
 
48
60
 
49
61
  attr_accessor :id
@@ -67,10 +79,28 @@ class Item
67
79
  attr_accessor :tags
68
80
  def tags?() @tags && @tags.size > 0; end
69
81
 
82
+ alias :categories :tags # for now allow categories alias for tags - remove (why? why not?)
83
+
84
+
85
+ # add attachments/media enclosures (url, length and type)
86
+ # note: lets support more than one (it's an array)
87
+ attr_accessor :attachments
88
+
89
+ def attachment() @attachments[0]; end
90
+ def attachments?() @attachments && @attachments.size > 0; end
91
+ alias :attachment? :attachments?
92
+
93
+ alias :enclosures :attachments
94
+ alias :enclosure :attachment
95
+ alias :enclosures? :attachments?
96
+ alias :enclosure? :attachments?
97
+
98
+
70
99
  def initialize
71
100
  ## note: make authors, tags empty arrays on startup (e.g. not nil)
72
- @authors = []
73
- @tags = []
101
+ @authors = []
102
+ @tags = []
103
+ @attachments = []
74
104
  end
75
105
 
76
106
  end # class Item
@@ -16,38 +16,53 @@ class Parser
16
16
  ### Note: lets keep/use same API as RSS::Parser for now
17
17
  def initialize( text )
18
18
  @text = text
19
+ @head = @text[0..100].strip # note: remove leading spaces if present
19
20
  end
20
21
 
21
22
 
22
23
 
23
- def parse
24
- head = @text[0..100].strip # note: remove leading spaces if present
25
-
26
- jsonfeed_version_regex = %r{"version":\s*"https://jsonfeed.org/version/1"}
24
+ #### note:
25
+ # make format checks callable from outside (that is, use builtin helper methods)
27
26
 
27
+ def is_xml?
28
28
  ## check if starts with known xml prologs
29
- if head.start_with?( '<?xml' ) ||
30
- head.start_with?( '<feed/' ) ||
31
- head.start_with?( '<rss/' )
29
+ @head.start_with?( '<?xml' ) ||
30
+ @head.start_with?( '<feed' ) ||
31
+ @head.start_with?( '<rss' )
32
+ end
33
+ alias_method :xml?, :is_xml?
34
+
35
+ JSONFEED_VERSION_RE = %r{"version":\s*"https://jsonfeed.org/version/1"}
36
+ def is_json?
32
37
  ## check if starts with { for json object/hash
33
38
  ## or if includes jsonfeed prolog
39
+ @head.start_with?( '{' ) ||
40
+ @head =~ JSONFEED_VERSION_RE
41
+ end
42
+ alias_method :json?, :is_json?
43
+
44
+ def is_microformats?
45
+ # for now check for microformats v2 (e.g. h-entry, h-feed)
46
+ # check for v1 too - why? why not? (e.g. hentry, hatom ??)
47
+ @text.include?( 'h-entry' ) ||
48
+ @text.include?( 'h-feed' )
49
+ end
50
+ alias_method :microformats?, :is_microformats?
51
+
52
+
53
+
54
+ def parse
55
+ if is_xml?
34
56
  parse_xml
35
- elsif head.start_with?( '{' ) ||
36
- head =~ jsonfeed_version_regex
57
+ elsif is_json?
37
58
  parse_json
38
59
  ## note: reading/parsing microformat is for now optional
39
60
  ## microformats gem requires nokogiri
40
61
  ## nokogiri (uses libxml c-extensions) makes it hard to install (sometime)
41
62
  ## thus, if you want to use it, please opt-in to keep the install "light"
42
- #
43
- # for now check for microformats v2 (e.g. h-entry, h-feed)
44
- # check for v1 too - why? why not? (e.g. hentry, hatom ??)
45
- elsif defined?( Microformats ) &&
46
- (@text.include?( 'h-entry' ) ||
47
- @text.include?( 'h-feed' )
48
- )
49
- parse_microformats
50
- else ## assume xml for now
63
+ elsif defined?( Microformats ) && is_microformats?
64
+ parse_microformats
65
+ else ## fallback - assume xml for now
51
66
  parse_xml
52
67
  end
53
68
  end # method parse
@@ -95,9 +110,9 @@ class Parser
95
110
  logger.debug " feed.class=#{feed_wild.class.name}"
96
111
 
97
112
  if feed_wild.is_a?( RSS::Atom::Feed )
98
- feed = AtomFeedBuilder.build( feed_wild )
113
+ feed = AtomFeedBuilder.build( feed_wild, @text )
99
114
  else # -- assume RSS::Rss::Feed
100
- feed = RssFeedBuilder.build( feed_wild )
115
+ feed = RssFeedBuilder.build( feed_wild, @text )
101
116
  end
102
117
 
103
118
  logger.debug "== #{feed.format} / #{feed.title} =="
@@ -0,0 +1,21 @@
1
+ # encoding: utf-8
2
+
3
+ module FeedParser
4
+
5
+ class Thumbnail
6
+
7
+ attr_accessor :url
8
+
9
+ ## note: uri is an alias for url
10
+ alias :uri :url ## add atom alias for uri - why? why not?
11
+ alias :uri= :url=
12
+
13
+ def width?() @width.nil? == false; end
14
+ attr_accessor :width
15
+
16
+ def height?() @height.nil? == false; end
17
+ attr_accessor :height # todo/check: use avatar_url ?? used by json feed -check if always a url
18
+
19
+ end # class Thumbnail
20
+
21
+ end # module FeedParser
@@ -3,7 +3,7 @@
3
3
  module FeedParser
4
4
 
5
5
  MAJOR = 2
6
- MINOR = 0
6
+ MINOR = 2
7
7
  PATCH = 0
8
8
  VERSION = [MAJOR,MINOR,PATCH].join('.')
9
9
 
@@ -0,0 +1,53 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:media="http://search.yahoo.com/mrss/" version="2.0">
3
+ <channel>
4
+ <title>Calm Meditation</title>
5
+ <link>http://sample-firetv-web-app.s3-website-us-west-2.amazonaws.com</link>
6
+ <language>en-us</language>
7
+ <pubDate>Mon, 02 Apr 2018 16:19:56 -0700</pubDate>
8
+ <lastBuildDate>Mon, 02 Apr 2018 16:19:56 -0700</lastBuildDate>
9
+ <managingEditor>tomjoht@gmail.com (Tom Johnson)</managingEditor>
10
+ <description>Contains short videos capturing still scenes from nature with a music background, intended for calming or meditation purposes. When you're stressed out or upset, watch a few videos. As your mind focuses on the small details, let your worries and frustrations float away. The purpose is not to entertain or to distract, but to help calm, soothe, and surface your inner quiet. The videos contain scenes from the San Tomas Aquinas trail in Santa Clara, California.</description>
11
+ <image>
12
+ <link>http://sample-firetv-web-app.s3-website-us-west-2.amazonaws.com</link>
13
+ <title>Calm Meditation</title>
14
+ <url>http://sample-firetv-web-app.s3-website-us-west-2.amazonaws.com/images/calmmeditationlogo_small.png</url>
15
+ <description>Contains short videos capturing still scenes from nature with a music background, intended for calming or meditation purposes. When you're stressed out or upset, watch a few videos. As your mind focuses on the small details, let your worries and frustrations float away. The purpose is not to entertain or to distract, but to help calm, soothe, and surface your inner quiet. The videos contain scenes from the San Tomas Aquinas trail in Santa Clara, California.</description>
16
+ <height>114</height>
17
+ <width>114</width>
18
+ </image>
19
+ <atom:link href="http://sample-firetv-web-app.s3-website-us-west-2.amazonaws.com/feed.xml" rel="self" type="application/rss+xml" />
20
+ <item>
21
+ <title>Shade</title>
22
+ <pubDate>Mon, 23 Oct 2017 00:00:00 -0700</pubDate>
23
+ <link>http://sample-firetv-web-app.s3-website-us-west-2.amazonaws.com/shade/</link>
24
+ <description>Quiet the mind, and the soul will speak. - Ma Jaya Sati Bhagavati</description>
25
+ <guid isPermaLink="false">http://sample-firetv-web-app.s3-website-us-west-2.amazonaws.com/shade/</guid>
26
+ <media:category>All</media:category>
27
+ <media:category>Trail</media:category>
28
+ <media:content url="http://d1nixf144dcz0j.cloudfront.net/shade.mp4" language="en-us" fileSize="37000000" duration="120.0" medium="video" isDefault="true">
29
+ <media:title type="plain">Shade</media:title>
30
+ <media:description type="html">Quiet the mind, and the soul will speak. - Ma Jaya Sati Bhagavati</media:description>
31
+ <media:thumbnail url="http://sample-firetv-web-app.s3-website-us-west-2.amazonaws.com/images/thumbs/shade.jpg" />
32
+ <media:credit role="author" scheme="urn:ebu">Tom Johnson</media:credit>
33
+ <media:copyright url="https://creativecommons.org/licenses/by/4.0/" />
34
+ </media:content>
35
+ </item>
36
+ <item>
37
+ <title>Spectators</title>
38
+ <pubDate>Thu, 12 Oct 2017 00:00:00 -0700</pubDate>
39
+ <link>http://sample-firetv-web-app.s3-website-us-west-2.amazonaws.com/spectators/</link>
40
+ <description>"Your worst enemy cannot harm you as much as your own thoughts, unguarded." – Buddha</description>
41
+ <guid isPermaLink="false">http://sample-firetv-web-app.s3-website-us-west-2.amazonaws.com/spectators/</guid>
42
+ <media:category>All</media:category>
43
+ <media:category>Grass</media:category>
44
+ <media:content url="http://d1nixf144dcz0j.cloudfront.net/spectators.mp4" language="en-us" fileSize="19000000" duration="120.0" medium="video" isDefault="true">
45
+ <media:title type="plain">Spectators</media:title>
46
+ <media:description type="html">"Your worst enemy cannot harm you as much as your own thoughts, unguarded." – Buddha</media:description>
47
+ <media:thumbnail url="http://sample-firetv-web-app.s3-website-us-west-2.amazonaws.com/images/thumbs/spectators.jpg" />
48
+ <media:credit role="author" scheme="urn:ebu">Tom Johnson</media:credit>
49
+ <media:copyright url="https://creativecommons.org/licenses/by/4.0/" />
50
+ </media:content>
51
+ </item>
52
+ </channel>
53
+ </rss>
@@ -4,6 +4,8 @@
4
4
  # or better
5
5
  # rake test
6
6
 
7
+
8
+
7
9
  require 'helper'
8
10
 
9
11
  class TestAtomLive < MiniTest::Test
@@ -11,36 +13,48 @@ class TestAtomLive < MiniTest::Test
11
13
  def test_rubyonrails
12
14
  feed = fetch_and_parse_feed( 'http://weblog.rubyonrails.org/feed/atom.xml' )
13
15
 
14
- assert_equal 'atom', feed.format
15
- assert_equal 'http://weblog.rubyonrails.org/', feed.url
16
+ assert_equal 'atom', feed.format
17
+ assert_equal 'https://weblog.rubyonrails.org/', feed.url
18
+ ## note was (2020/1): 'http://weblog.rubyonrails.org/', feed.url
16
19
  end
17
20
 
18
21
 
19
22
  def test_railstutorial
20
23
  feed = fetch_and_parse_feed( 'http://feeds.feedburner.com/railstutorial?format=xml' )
21
24
 
22
- assert_equal 'atom', feed.format
23
- assert_equal 'http://news.learnenough.com/', feed.url
25
+ assert_equal 'atom', feed.format
26
+ assert_equal 'https://news.learnenough.com/', feed.url
27
+ ## note was (2020/1): assert_equal 'http://news.learnenough.com/', feed.url
24
28
  ## note was (2017/5): assert_equal 'http://news.railstutorial.org/', feed.url
25
29
  end
26
30
 
27
31
 
32
+ =begin
33
+ ### returns ssl error e.g.
34
+ ## OpenSSL::SSL::SSLError: SSL_connect SYSCALL returned=5 errno=0 state=SSLv2/v3 read server
28
35
  def test_googlegroup
29
36
  feed = fetch_and_parse_feed( 'https://groups.google.com/forum/feed/beerdb/topics/atom.xml?num=15' )
30
37
 
31
38
  assert_equal 'atom', feed.format
32
39
  assert_equal 'https://groups.google.com/d/forum/beerdb', feed.url
33
40
  end
41
+ =end
34
42
 
35
43
 
36
44
  def test_headius
37
- feed = fetch_and_parse_feed( 'http://blog.headius.com/feeds/posts/default' )
45
+ feed = fetch_and_parse_feed( 'http://blog.headius.com/feed.xml' )
46
+ ## note was (2020/1): 'http://blog.headius.com/feeds/posts/default'
38
47
 
39
48
  assert_equal 'atom', feed.format
40
- assert_equal 'Blogger', feed.generator.name
41
- assert_equal 'Headius', feed.title
42
- assert_equal 'Helping the JVM Into the 21st Century', feed.summary # aka subtitle in atom
43
- assert_equal 'http://blog.headius.com/', feed.url
49
+ assert_equal 'Jekyll', feed.generator.name
50
+ ## note was (2020/1): 'Blogger'
51
+
52
+ assert_equal 'Charles Oliver Nutter', feed.title
53
+ ## note was (2020/1): 'Headius', feed.title
54
+ assert_equal 'Java, Ruby, and JVM guy trying to make sense of it all', feed.summary # aka subtitle in atom
55
+ ## note was (2020/1): 'Helping the JVM Into the 21st Century', feed.title
56
+ assert_equal 'https://headius.github.io/', feed.url
57
+ ## note was (2020/1): 'http://blog.headius.com/'
44
58
  end
45
59
 
46
60
  end
@@ -0,0 +1,69 @@
1
+ ###
2
+ # to run use
3
+ # ruby -I ./lib -I ./test test/test_attachments_live.rb
4
+ # or better
5
+ # rake test
6
+
7
+ require 'helper'
8
+
9
+
10
+ ###
11
+ ## note: needs to require oga gem (it's not required by default - it's a "soft" dependency)
12
+
13
+ require 'oga'
14
+
15
+
16
+
17
+ class TestAttachmentsLive < MiniTest::Test
18
+
19
+ def test_atom_enclose
20
+ feed = fetch_and_parse_feed( 'http://www.lse.ac.uk/assets/richmedia/webFeeds/publicLecturesAndEvents_AtomAllMediaTypesLatest100.xml' )
21
+
22
+ assert_equal 'audio/mpeg', feed.items.first.attachment.type
23
+ assert_equal 'audio/mpeg', feed.items.first.enclosure.type
24
+
25
+ assert_equal true, feed.items.first.attachment?
26
+ assert_equal true, feed.items.first.enclosure?
27
+ end
28
+
29
+ def test_atom_media
30
+ feed = fetch_and_parse_feed( 'http://www.youtube.com/feeds/videos.xml?channel_id=UCZUT79WUUpZlZ-XMF7l4CFg' )
31
+ assert_equal true, feed.items.first.attachment?
32
+ assert feed.items.first.attachments.first.title
33
+ assert feed.items.first.attachments.first.url
34
+ assert feed.items.first.attachments.first.thumbnail
35
+ assert_instance_of FeedParser::Thumbnail, feed.items.first.attachments.first.thumbnail
36
+ assert feed.items.first.attachments.first.thumbnail.url
37
+ assert_equal 480, feed.items.first.attachments.first.thumbnail.width.to_i
38
+ assert_equal 360, feed.items.first.attachments.first.thumbnail.height.to_i
39
+ assert feed.items.first.attachments.first.description
40
+ end
41
+
42
+ def test_rss_media
43
+ # tests an example RSS file from https://creator.amazon.com/documentation/ac/mrss.html. Note that, unlike the Atom example, it
44
+ # does not put everything under media:group
45
+ testpath = File.join(File.expand_path(File.dirname(__FILE__)), 'media_rss_example.txt')
46
+ feed_rss = File.read( testpath )
47
+ feed = FeedParser::Parser.parse( feed_rss )
48
+ assert_equal true, feed.items.first.attachment?
49
+ assert feed.items.first.attachments.first.title
50
+ assert feed.items.first.attachments.first.url
51
+ assert feed.items.first.attachments.first.thumbnail
52
+ assert_instance_of FeedParser::Thumbnail, feed.items.first.attachments.first.thumbnail
53
+ assert feed.items.first.attachments.first.thumbnail.url
54
+ assert_nil feed.items.first.attachments.first.thumbnail.width
55
+ assert_nil feed.items.first.attachments.first.thumbnail.height
56
+ assert feed.items.first.attachments.first.description
57
+ end
58
+
59
+ def test_rss_enclosure
60
+ feed = fetch_and_parse_feed( 'http://www.radiofreesatan.com/category/featured/feed/' )
61
+
62
+ assert_equal 'audio/mpeg', feed.items.first.attachment.type
63
+ assert_equal 'audio/mpeg', feed.items.first.enclosure.type
64
+
65
+ assert_equal true, feed.items.first.attachment?
66
+ assert_equal true, feed.items.first.enclosure?
67
+ end
68
+
69
+ end
@@ -31,7 +31,7 @@ class TestRssLive < MiniTest::Test
31
31
  def test_rubymine
32
32
  # includes item/content:encoded
33
33
  feed = fetch_and_parse_feed( 'http://feeds.feedburner.com/jetbrains_rubymine?format=xml' )
34
-
34
+
35
35
  assert_equal 'rss 2.0', feed.format
36
36
  end
37
37
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: feedparser
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.0
4
+ version: 2.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-06-08 00:00:00.000000000 Z
11
+ date: 2020-05-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: logutils
@@ -58,29 +58,30 @@ dependencies:
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '3.15'
61
+ version: '3.16'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '3.15'
69
- description: feedparser - web feed parser and normalizer (RSS 2.0, Atom, JSON Feed,
70
- HTML h-entry, etc.)
68
+ version: '3.16'
69
+ description: feedparser - web feed parser and normalizer (RSS, Atom, JSON Feed, HTML
70
+ h-entry, etc.)
71
71
  email: wwwmake@googlegroups.com
72
72
  executables: []
73
73
  extensions: []
74
74
  extra_rdoc_files:
75
- - HISTORY.md
75
+ - CHANGELOG.md
76
76
  - Manifest.txt
77
77
  - README.md
78
78
  files:
79
- - HISTORY.md
79
+ - CHANGELOG.md
80
80
  - Manifest.txt
81
81
  - README.md
82
82
  - Rakefile
83
83
  - lib/feedparser.rb
84
+ - lib/feedparser/attachment.rb
84
85
  - lib/feedparser/author.rb
85
86
  - lib/feedparser/builder/atom.rb
86
87
  - lib/feedparser/builder/json.rb
@@ -91,9 +92,12 @@ files:
91
92
  - lib/feedparser/item.rb
92
93
  - lib/feedparser/parser.rb
93
94
  - lib/feedparser/tag.rb
95
+ - lib/feedparser/thumbnail.rb
94
96
  - lib/feedparser/version.rb
95
97
  - test/helper.rb
98
+ - test/media_rss_example.txt
96
99
  - test/test_atom_live.rb
100
+ - test/test_attachments_live.rb
97
101
  - test/test_dates.rb
98
102
  - test/test_microformats.rb
99
103
  - test/test_rss_live.rb
@@ -111,7 +115,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
111
115
  requirements:
112
116
  - - ">="
113
117
  - !ruby/object:Gem::Version
114
- version: 1.9.2
118
+ version: 2.2.2
115
119
  required_rubygems_version: !ruby/object:Gem::Requirement
116
120
  requirements:
117
121
  - - ">="
@@ -119,9 +123,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
119
123
  version: '0'
120
124
  requirements: []
121
125
  rubyforge_project:
122
- rubygems_version: 2.6.7
126
+ rubygems_version: 2.5.2
123
127
  signing_key:
124
128
  specification_version: 4
125
- summary: feedparser - web feed parser and normalizer (RSS 2.0, Atom, JSON Feed, HTML
126
- h-entry, etc.)
129
+ summary: feedparser - web feed parser and normalizer (RSS, Atom, JSON Feed, HTML h-entry,
130
+ etc.)
127
131
  test_files: []