feedparser 1.2.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +4 -4
  2. data/Manifest.txt +2 -50
  3. data/README.md +71 -9
  4. data/Rakefile +1 -1
  5. data/lib/feedparser.rb +2 -0
  6. data/lib/feedparser/builder/microformats.rb +264 -0
  7. data/lib/feedparser/parser.rb +27 -0
  8. data/lib/feedparser/version.rb +2 -2
  9. data/test/helper.rb +3 -57
  10. data/test/test_microformats.rb +52 -0
  11. metadata +10 -56
  12. data/test/feeds/books/nostarch.rss +0 -125
  13. data/test/feeds/books/oreilly.feedburner.atom +0 -387
  14. data/test/feeds/books/pragprog.rss +0 -148
  15. data/test/feeds/byparker.json +0 -643
  16. data/test/feeds/daringfireball.atom +0 -1873
  17. data/test/feeds/daringfireball.json +0 -619
  18. data/test/feeds/googlegroups.atom +0 -37
  19. data/test/feeds/googlegroups2.atom +0 -27
  20. data/test/feeds/headius.atom +0 -123
  21. data/test/feeds/inessential.json +0 -182
  22. data/test/feeds/intertwingly.atom +0 -1197
  23. data/test/feeds/jsonfeed.json +0 -37
  24. data/test/feeds/lambdatheultimate.rss +0 -288
  25. data/test/feeds/learnenough.feedburner.atom +0 -747
  26. data/test/feeds/news/nytimes-blogs-bits.rss +0 -333
  27. data/test/feeds/news/nytimes-paul-krugman.rss +0 -60
  28. data/test/feeds/news/nytimes-tech.rss +0 -653
  29. data/test/feeds/news/nytimes-thomas-l-friedman.rss +0 -80
  30. data/test/feeds/news/nytimes.rss +0 -607
  31. data/test/feeds/news/washingtonpost-blogs-innovations.rss +0 -183
  32. data/test/feeds/news/washingtonpost-politics.rss +0 -35
  33. data/test/feeds/news/washingtonpost-world.rss +0 -29
  34. data/test/feeds/ongoing.atom +0 -1619
  35. data/test/feeds/osm/blog.openstreetmap.rss +0 -252
  36. data/test/feeds/osm/blogs.openstreetmap.rss +0 -585
  37. data/test/feeds/osm/mapbox.rss +0 -1883
  38. data/test/feeds/railstutorial.feedburner.atom +0 -656
  39. data/test/feeds/rubyflow.feedburner.rss +0 -120
  40. data/test/feeds/rubymine.feedburner.rss +0 -314
  41. data/test/feeds/rubyonrails.atom +0 -1241
  42. data/test/feeds/scripting.rss +0 -881
  43. data/test/feeds/sitepoint.rss +0 -218
  44. data/test/feeds/spec/atom/author.atom +0 -48
  45. data/test/feeds/spec/atom/authors.atom +0 -70
  46. data/test/feeds/spec/atom/categories.atom +0 -66
  47. data/test/feeds/spec/json/example.json +0 -36
  48. data/test/feeds/spec/json/microblog.json +0 -43
  49. data/test/feeds/spec/json/tags.json +0 -33
  50. data/test/feeds/spec/rss/author.rss +0 -41
  51. data/test/feeds/spec/rss/categories.rss +0 -64
  52. data/test/feeds/spec/rss/creator.rss +0 -38
  53. data/test/feeds/xkcd.atom +0 -48
  54. data/test/feeds/xkcd.rss +0 -55
  55. data/test/test_atom.rb +0 -27
  56. data/test/test_authors.rb +0 -26
  57. data/test/test_books.rb +0 -25
  58. data/test/test_feeds.rb +0 -29
  59. data/test/test_json.rb +0 -27
  60. data/test/test_rss.rb +0 -26
  61. data/test/test_tags.rb +0 -25
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 520ebca7dbff20c3347536869aa6dcfd87d68ddc
4
- data.tar.gz: 5378e9fc3d8038ed61d085999fcb7d1ab3e1f041
3
+ metadata.gz: 12a89b6c4d0ad5290a16d44a79245a25bb6ff349
4
+ data.tar.gz: 63057209038f7cab23eb4fd963d9525b154d2fbb
5
5
  SHA512:
6
- metadata.gz: 40cc695312a0007da9e569fd05dd35d6621e08367248191eea645c77a93ed5e64c58c0febd9b9d5762177751b3e9c712adbcd52c1ec951e9125c085719f37734
7
- data.tar.gz: 96fb8f2c92a83447cc5598b667a3a5036d66c93305b3a624a9f6081e4489b43f5bf37b51cc61959f0ad1a621bd6d1c456a514a1c501e7112179f00d6f1c55cfb
6
+ metadata.gz: 6220dd036c705fedb52a17495001003c88891a912c3ad9e3fed07045adde3f0902d8dcbf3fc7000fbc297b91bba388979c54fbdaf1890f20e541d9216bfc5438
7
+ data.tar.gz: 049a3b9cdf4b27fe1fbc2e9109241ed951e4a717e6f620bd6aa7b2b17c9657e1fe468b86fd52cd1bf5ff48192935b3939d6579ca612c5ac576553c2f9bfcb433
@@ -6,6 +6,7 @@ lib/feedparser.rb
6
6
  lib/feedparser/author.rb
7
7
  lib/feedparser/builder/atom.rb
8
8
  lib/feedparser/builder/json.rb
9
+ lib/feedparser/builder/microformats.rb
9
10
  lib/feedparser/builder/rss.rb
10
11
  lib/feedparser/feed.rb
11
12
  lib/feedparser/generator.rb
@@ -13,57 +14,8 @@ lib/feedparser/item.rb
13
14
  lib/feedparser/parser.rb
14
15
  lib/feedparser/tag.rb
15
16
  lib/feedparser/version.rb
16
- test/feeds/books/nostarch.rss
17
- test/feeds/books/oreilly.feedburner.atom
18
- test/feeds/books/pragprog.rss
19
- test/feeds/byparker.json
20
- test/feeds/daringfireball.atom
21
- test/feeds/daringfireball.json
22
- test/feeds/googlegroups.atom
23
- test/feeds/googlegroups2.atom
24
- test/feeds/headius.atom
25
- test/feeds/inessential.json
26
- test/feeds/intertwingly.atom
27
- test/feeds/jsonfeed.json
28
- test/feeds/lambdatheultimate.rss
29
- test/feeds/learnenough.feedburner.atom
30
- test/feeds/news/nytimes-blogs-bits.rss
31
- test/feeds/news/nytimes-paul-krugman.rss
32
- test/feeds/news/nytimes-tech.rss
33
- test/feeds/news/nytimes-thomas-l-friedman.rss
34
- test/feeds/news/nytimes.rss
35
- test/feeds/news/washingtonpost-blogs-innovations.rss
36
- test/feeds/news/washingtonpost-politics.rss
37
- test/feeds/news/washingtonpost-world.rss
38
- test/feeds/ongoing.atom
39
- test/feeds/osm/blog.openstreetmap.rss
40
- test/feeds/osm/blogs.openstreetmap.rss
41
- test/feeds/osm/mapbox.rss
42
- test/feeds/railstutorial.feedburner.atom
43
- test/feeds/rubyflow.feedburner.rss
44
- test/feeds/rubymine.feedburner.rss
45
- test/feeds/rubyonrails.atom
46
- test/feeds/scripting.rss
47
- test/feeds/sitepoint.rss
48
- test/feeds/spec/atom/author.atom
49
- test/feeds/spec/atom/authors.atom
50
- test/feeds/spec/atom/categories.atom
51
- test/feeds/spec/json/example.json
52
- test/feeds/spec/json/microblog.json
53
- test/feeds/spec/json/tags.json
54
- test/feeds/spec/rss/author.rss
55
- test/feeds/spec/rss/categories.rss
56
- test/feeds/spec/rss/creator.rss
57
- test/feeds/xkcd.atom
58
- test/feeds/xkcd.rss
59
17
  test/helper.rb
60
- test/test_atom.rb
61
18
  test/test_atom_live.rb
62
- test/test_authors.rb
63
- test/test_books.rb
64
19
  test/test_dates.rb
65
- test/test_feeds.rb
66
- test/test_json.rb
67
- test/test_rss.rb
20
+ test/test_microformats.rb
68
21
  test/test_rss_live.rb
69
- test/test_tags.rb
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # feedparser
2
2
 
3
- feedparser gem - web feed parser and normalizer (Atom, RSS 2.0, JSON, etc.)
3
+ feedparser gem - web feed parser and normalizer (Atom, RSS 2.0, JSON Feed, HTML h-entry, etc.)
4
4
 
5
5
  * home :: [github.com/feedparser/feedparser](https://github.com/feedparser/feedparser)
6
6
  * bugs :: [github.com/feedparser/feedparser/issues](https://github.com/feedparser/feedparser/issues)
@@ -11,11 +11,19 @@ feedparser gem - web feed parser and normalizer (Atom, RSS 2.0, JSON, etc.)
11
11
 
12
12
  ## What's News?
13
13
 
14
- May/2017: Added support for reading feeds in the new [JSON Feed](https://jsonfeed.org) format in - surprise, surprise - JSON.
14
+ **June/2017**: Added support for reading feeds in HTML with Microformats incl.
15
+ [`h-entry`](http://microformats.org/wiki/h-entry),
16
+ [`h-feed`](http://microformats.org/wiki/h-feed) and others.
17
+
18
+ All feeds with test assertions for easy (re)use and browsing moved
19
+ to their own repo, that is, [`/feeds`](https://github.com/feedparser/feeds).
20
+
21
+ **May/2017**: Added support for reading feeds in the new [JSON Feed](https://jsonfeed.org) format in - surprise, surprise - JSON.
22
+
15
23
 
16
24
  ## What's a Web Feed?
17
25
 
18
- See the [Awesome Feeds](https://github.com/statictimes/awesome-feeds) page ».
26
+ See the [Awesome Feeds](https://github.com/feedparser/awesome-feeds) page ».
19
27
 
20
28
 
21
29
 
@@ -62,7 +70,7 @@ RFC-822 date format e.g. Wed, 14 Jan 2015 19:48:57 +0100
62
70
  ISO-8601 date format e.g. 2015-01-11T09:30:16Z
63
71
 
64
72
 
65
- ```
73
+ ``` ruby
66
74
  class Feed
67
75
  attr_accessor :format # e.g. atom|rss 2.0|etc.
68
76
  attr_accessor :title # note: always plain vanilla text - if present html tags will get stripped and html entities unescaped
@@ -75,9 +83,9 @@ class Feed
75
83
  attr_accessor :updated # note: is lastBuildDate in RSS 2.0
76
84
  attr_accessor :published # note: is pubDate in RSS 2.0; not available in Atom
77
85
 
86
+ attr_accessor :authors
87
+ attr_accessor :tags
78
88
  attr_accessor :generator
79
- attr_accessor :generator_version # e.g. @version (atom)
80
- attr_accessor :generator_uri # e.g. @uri (atom) - use alias url/link ???
81
89
  end
82
90
  ```
83
91
 
@@ -121,7 +129,7 @@ Note: The content element will assume html content.
121
129
 
122
130
  Note: In plain vanilla RSS 2.0 there's only one `pubDate` for items, thus, it's not possible to differentiate between published and updated dates for items; note - the `item.pubDate` will get mapped to `item.updated`. To set the published date in RSS 2.0 use the dublin core module e.g. `dc:created`, for example.
123
131
 
124
- ```
132
+ ``` ruby
125
133
  class Item
126
134
  attr_accessor :title # note: always plain vanilla text - if present html tags will get stripped and html entities
127
135
  attr_accessor :url
@@ -158,7 +166,7 @@ end
158
166
 
159
167
  ### Read Feed Example
160
168
 
161
- ```
169
+ ``` ruby
162
170
  require 'open-uri'
163
171
  require 'feedparser'
164
172
 
@@ -190,7 +198,7 @@ puts feed.items[0].content
190
198
  or reading a feed in the new [JSON Feed](https://jsonfeed.org) format in - surprise, surprise - JSON;
191
199
  note: nothing changes :-)
192
200
 
193
- ```
201
+ ``` ruby
194
202
  txt = open( 'http://openfootball.github.io/feed.json' ).read
195
203
 
196
204
  feed = FeedParser::Parser.parse( txt )
@@ -216,6 +224,60 @@ puts feed.items[0].content_text
216
224
  ...
217
225
  ```
218
226
 
227
+ ### Microformats
228
+
229
+ Microformats let you mark up feeds and posts in HTML with
230
+ [`h-entry`](http://microformats.org/wiki/h-entry),
231
+ [`h-feed`](http://microformats.org/wiki/h-feed),
232
+ and friends.
233
+
234
+ Note: Microformats support in feedparser is optional.
235
+ Install and require the [microformats gem](https://github.com/indieweb/microformats-ruby) to read
236
+ feeds in HTML with Microformats.
237
+
238
+
239
+ ``` ruby
240
+
241
+ require 'microformats'
242
+
243
+ text =<<HTML
244
+ <article class="h-entry">
245
+ <h1 class="p-name">Microformats are amazing</h1>
246
+ <p>Published by
247
+ <a class="p-author h-card" href="http://example.com">W. Developer</a>
248
+ on <time class="dt-published" datetime="2013-06-13 12:00:00">13<sup>th</sup> June 2013</time>
249
+
250
+ <p class="p-summary">In which I extoll the virtues of using microformats.</p>
251
+
252
+ <div class="e-content">
253
+ <p>Blah blah blah</p>
254
+ </div>
255
+ </article>
256
+ HTML
257
+
258
+ feed = FeedParser::Parser.parse( text )
259
+
260
+ puts feed.format
261
+ # => "html"
262
+ puts feed.items.size
263
+ # => 1
264
+ puts feed.items[0].authors.size
265
+ # => 1
266
+ puts feed.items[0].content_html
267
+ # => "<p>Blah blah blah</p>"
268
+ puts feed.items[0].content_text
269
+ # => "Blah blah blah"
270
+ puts feed.items[0].title
271
+ # => "Microformats are amazing"
272
+ puts feed.items[0].summary
273
+ # => "In which I extoll the virtues of using microformats."
274
+ puts feed.items[0].published
275
+ # => 2013-06-13 12:00:00
276
+ puts feed.items[0].authors[0].name
277
+ # => "W. Developer"
278
+ ...
279
+ ```
280
+
219
281
  ## Samples
220
282
 
221
283
  ### Feed Reader
data/Rakefile CHANGED
@@ -5,7 +5,7 @@ Hoe.spec 'feedparser' do
5
5
 
6
6
  self.version = FeedParser::VERSION
7
7
 
8
- self.summary = 'feedparser - web feed parser and normalizer (RSS 2.0, Atom, JSON, etc.)'
8
+ self.summary = 'feedparser - web feed parser and normalizer (RSS 2.0, Atom, JSON Feed, HTML h-entry, etc.)'
9
9
  self.description = summary
10
10
 
11
11
  self.urls = ['https://github.com/feedparser/feedparser']
@@ -23,6 +23,8 @@ require 'feedparser/version' # let it always go first
23
23
  require 'feedparser/builder/atom'
24
24
  require 'feedparser/builder/rss'
25
25
  require 'feedparser/builder/json'
26
+ require 'feedparser/builder/microformats'
27
+
26
28
 
27
29
  require 'feedparser/feed'
28
30
  require 'feedparser/item'
@@ -0,0 +1,264 @@
1
+ # encoding: utf-8
2
+
3
+ module FeedParser
4
+
5
+
6
+ class HyFeedBuilder
7
+
8
+ include LogUtils::Logging
9
+
10
+
11
+ def self.build( hash )
12
+ feed = self.new( hash )
13
+ feed.to_feed
14
+ end
15
+
16
+ def initialize( hash )
17
+ @feed = build_feed( hash )
18
+ end
19
+
20
+ def to_feed
21
+ @feed
22
+ end
23
+
24
+
25
+ def build_feed( h )
26
+
27
+ b = HyBuilder.new( h ) ## convert hash to structs
28
+
29
+ ## use first feed - more really possible?
30
+ ## fix/todo: handle no feed too!!!
31
+ hy = b.feeds[0]
32
+
33
+ ## pp hy
34
+
35
+ feed = Feed.new
36
+ feed.format = 'html'
37
+
38
+ ### todo: add
39
+ ## - feed.title
40
+ ## - feed.url
41
+ ## - feed.feed_url
42
+ ## - feed.summary
43
+ ## - feed.authors
44
+ ## etc.
45
+
46
+ hy.entries.each do |entry|
47
+ feed.items << build_item( entry )
48
+ end
49
+
50
+ feed # return new feed
51
+ end # method build_feed
52
+
53
+
54
+ def build_author( hy )
55
+ author = Author.new
56
+
57
+ author.name = hy.name
58
+
59
+ ## todo - add:
60
+ ## author.url
61
+
62
+ author
63
+ end
64
+
65
+
66
+
67
+ def build_item( hy )
68
+ item = Item.new # Item.new
69
+
70
+ item.title = hy.name
71
+ item.url = hy.url
72
+ item.published_local = hy.published_local
73
+ item.published = hy.published
74
+
75
+ item.content_html = hy.content_html
76
+ item.content_text = hy.content_text
77
+ item.summary = hy.summary
78
+
79
+ ## check: how to add an id - auto-generate - why? why not??
80
+ ## item.id = h['id']
81
+
82
+ hy.authors.each do |author|
83
+ item.authors << build_author( author )
84
+ end
85
+
86
+ item
87
+ end # method build_item
88
+
89
+ end # class HyFeedBuilder
90
+
91
+
92
+
93
+ class HyFeed
94
+ attr_accessor :entries
95
+
96
+ def initialize
97
+ @entries = []
98
+ end
99
+ end # class HyFeed
100
+
101
+
102
+ class HyEntry
103
+ attr_accessor :name
104
+ attr_accessor :content
105
+ attr_accessor :content_text
106
+ attr_accessor :summary
107
+
108
+ attr_accessor :published # utc time
109
+ attr_accessor :published_local # local time (with timezone/offset)
110
+ attr_accessor :url
111
+
112
+ attr_accessor :authors # note: allow multiple authors
113
+
114
+ # note: title is an alias for name
115
+ alias :title :name
116
+ alias :title= :name=
117
+
118
+ # note: content_html is an alias for content
119
+ alias :content_html :content
120
+ alias :content_html= :content=
121
+
122
+ def initialize
123
+ @authors = []
124
+ end
125
+
126
+ end ## class HyEntry
127
+
128
+
129
+ class HyAuthor
130
+ attr_accessor :name
131
+ attr_accessor :url
132
+ end ## class HyAuthor
133
+
134
+
135
+
136
+
137
+ class HyBuilder
138
+
139
+ attr_reader :feeds
140
+
141
+ def initialize( hash )
142
+ @h = hash
143
+ @feeds = []
144
+ build
145
+
146
+ pp @feeds
147
+ end
148
+
149
+ def build
150
+
151
+ entries = []
152
+ @h['items'].each_with_index do |item_hash,i|
153
+ puts "item #{i+1}:"
154
+ pp item_hash
155
+
156
+ types = item_hash['type']
157
+ pp types
158
+ if types.include?( 'h-feed' )
159
+ @feeds << build_feed( item_hash )
160
+ elsif types.include?( 'h-entry' )
161
+ entries << build_entry( item_hash )
162
+ else
163
+ ## unknown type; skip for now
164
+ end
165
+ end
166
+
167
+ ## wrap all "loose" entries in a "dummy" h-entry feed
168
+ if entries.any?
169
+ feed = HyFeed.new
170
+ feed.entries = entries
171
+ @feeds << feed
172
+ end
173
+
174
+ end # method build
175
+
176
+ def build_feed( h )
177
+ puts " build_feed"
178
+
179
+ feed = HyFeed.new
180
+
181
+ h['children'].each_with_index do |item_hash,i|
182
+ puts "item #{i+1}:"
183
+ pp item_hash
184
+
185
+ types = item_hash['type']
186
+ pp types
187
+ if types.include?( 'h-entry' )
188
+ feed.entries << build_entry( item_hash )
189
+ else
190
+ ## unknown type; skip for now
191
+ end
192
+ end
193
+
194
+ feed
195
+ end ## method build_feed
196
+
197
+
198
+ def build_entry( h )
199
+ puts " build_entry"
200
+
201
+ entry = HyEntry.new
202
+
203
+ props = h['properties']
204
+ pp props
205
+
206
+ entry.name = props['name'].join( ' ') # check an example with more entries (how to join??)
207
+
208
+ if props['summary']
209
+ entry.summary = props['summary'].join( ' ' )
210
+ end
211
+
212
+ if props['content']
213
+ ## add up all value attribs in content
214
+ entry.content_text = props['content'].map { |h| h[:value] }.join( ' ' ).strip
215
+ ## add up all html attribs in content; plus strip leading n trailing whitespaces
216
+ entry.content = props['content'].map { |h| h[:html] }.join( ' ' ).strip
217
+ end
218
+
219
+
220
+ # get first field in array -- check if really ever possible more than one? what does it mean (many dates)???
221
+ ## todo: check if datetime is always utc (or local possible?)
222
+ url_str = props.fetch( 'url', [] )[0]
223
+ if url_str
224
+ entry.url = url_str
225
+ end
226
+
227
+ # get first field in array -- check if really ever possible more than one? what does it mean (many dates)???
228
+ ## todo: check if datetime is always utc (or local possible?)
229
+ published_str = props.fetch( 'published', [] )[0]
230
+ pp published_str
231
+ if published_str
232
+ ## entry.published = DateTime.iso8601( published_str )
233
+ entry.published_local = DateTime.parse( published_str )
234
+ entry.published = entry.published_local.utc
235
+ end
236
+
237
+ ## check for authors
238
+ if props['author']
239
+ props['author'].each do |author_hash|
240
+ pp author_hash
241
+ entry.authors << build_author( author_hash )
242
+ end
243
+ end
244
+
245
+ entry
246
+ end # method build_entry
247
+
248
+ def build_author( h )
249
+ puts " build_author"
250
+
251
+ author = HyAuthor.new
252
+
253
+ author.name = h['value']
254
+
255
+ ## todo/fix: -- note: for now skip possible embedded h-card
256
+ author
257
+ end # method build_author
258
+
259
+
260
+ end # class HyBuilder
261
+
262
+
263
+
264
+ end # module FeedParser