feedparser 1.2.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. checksums.yaml +4 -4
  2. data/Manifest.txt +2 -50
  3. data/README.md +71 -9
  4. data/Rakefile +1 -1
  5. data/lib/feedparser.rb +2 -0
  6. data/lib/feedparser/builder/microformats.rb +264 -0
  7. data/lib/feedparser/parser.rb +27 -0
  8. data/lib/feedparser/version.rb +2 -2
  9. data/test/helper.rb +3 -57
  10. data/test/test_microformats.rb +52 -0
  11. metadata +10 -56
  12. data/test/feeds/books/nostarch.rss +0 -125
  13. data/test/feeds/books/oreilly.feedburner.atom +0 -387
  14. data/test/feeds/books/pragprog.rss +0 -148
  15. data/test/feeds/byparker.json +0 -643
  16. data/test/feeds/daringfireball.atom +0 -1873
  17. data/test/feeds/daringfireball.json +0 -619
  18. data/test/feeds/googlegroups.atom +0 -37
  19. data/test/feeds/googlegroups2.atom +0 -27
  20. data/test/feeds/headius.atom +0 -123
  21. data/test/feeds/inessential.json +0 -182
  22. data/test/feeds/intertwingly.atom +0 -1197
  23. data/test/feeds/jsonfeed.json +0 -37
  24. data/test/feeds/lambdatheultimate.rss +0 -288
  25. data/test/feeds/learnenough.feedburner.atom +0 -747
  26. data/test/feeds/news/nytimes-blogs-bits.rss +0 -333
  27. data/test/feeds/news/nytimes-paul-krugman.rss +0 -60
  28. data/test/feeds/news/nytimes-tech.rss +0 -653
  29. data/test/feeds/news/nytimes-thomas-l-friedman.rss +0 -80
  30. data/test/feeds/news/nytimes.rss +0 -607
  31. data/test/feeds/news/washingtonpost-blogs-innovations.rss +0 -183
  32. data/test/feeds/news/washingtonpost-politics.rss +0 -35
  33. data/test/feeds/news/washingtonpost-world.rss +0 -29
  34. data/test/feeds/ongoing.atom +0 -1619
  35. data/test/feeds/osm/blog.openstreetmap.rss +0 -252
  36. data/test/feeds/osm/blogs.openstreetmap.rss +0 -585
  37. data/test/feeds/osm/mapbox.rss +0 -1883
  38. data/test/feeds/railstutorial.feedburner.atom +0 -656
  39. data/test/feeds/rubyflow.feedburner.rss +0 -120
  40. data/test/feeds/rubymine.feedburner.rss +0 -314
  41. data/test/feeds/rubyonrails.atom +0 -1241
  42. data/test/feeds/scripting.rss +0 -881
  43. data/test/feeds/sitepoint.rss +0 -218
  44. data/test/feeds/spec/atom/author.atom +0 -48
  45. data/test/feeds/spec/atom/authors.atom +0 -70
  46. data/test/feeds/spec/atom/categories.atom +0 -66
  47. data/test/feeds/spec/json/example.json +0 -36
  48. data/test/feeds/spec/json/microblog.json +0 -43
  49. data/test/feeds/spec/json/tags.json +0 -33
  50. data/test/feeds/spec/rss/author.rss +0 -41
  51. data/test/feeds/spec/rss/categories.rss +0 -64
  52. data/test/feeds/spec/rss/creator.rss +0 -38
  53. data/test/feeds/xkcd.atom +0 -48
  54. data/test/feeds/xkcd.rss +0 -55
  55. data/test/test_atom.rb +0 -27
  56. data/test/test_authors.rb +0 -26
  57. data/test/test_books.rb +0 -25
  58. data/test/test_feeds.rb +0 -29
  59. data/test/test_json.rb +0 -27
  60. data/test/test_rss.rb +0 -26
  61. data/test/test_tags.rb +0 -25
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 520ebca7dbff20c3347536869aa6dcfd87d68ddc
4
- data.tar.gz: 5378e9fc3d8038ed61d085999fcb7d1ab3e1f041
3
+ metadata.gz: 12a89b6c4d0ad5290a16d44a79245a25bb6ff349
4
+ data.tar.gz: 63057209038f7cab23eb4fd963d9525b154d2fbb
5
5
  SHA512:
6
- metadata.gz: 40cc695312a0007da9e569fd05dd35d6621e08367248191eea645c77a93ed5e64c58c0febd9b9d5762177751b3e9c712adbcd52c1ec951e9125c085719f37734
7
- data.tar.gz: 96fb8f2c92a83447cc5598b667a3a5036d66c93305b3a624a9f6081e4489b43f5bf37b51cc61959f0ad1a621bd6d1c456a514a1c501e7112179f00d6f1c55cfb
6
+ metadata.gz: 6220dd036c705fedb52a17495001003c88891a912c3ad9e3fed07045adde3f0902d8dcbf3fc7000fbc297b91bba388979c54fbdaf1890f20e541d9216bfc5438
7
+ data.tar.gz: 049a3b9cdf4b27fe1fbc2e9109241ed951e4a717e6f620bd6aa7b2b17c9657e1fe468b86fd52cd1bf5ff48192935b3939d6579ca612c5ac576553c2f9bfcb433
@@ -6,6 +6,7 @@ lib/feedparser.rb
6
6
  lib/feedparser/author.rb
7
7
  lib/feedparser/builder/atom.rb
8
8
  lib/feedparser/builder/json.rb
9
+ lib/feedparser/builder/microformats.rb
9
10
  lib/feedparser/builder/rss.rb
10
11
  lib/feedparser/feed.rb
11
12
  lib/feedparser/generator.rb
@@ -13,57 +14,8 @@ lib/feedparser/item.rb
13
14
  lib/feedparser/parser.rb
14
15
  lib/feedparser/tag.rb
15
16
  lib/feedparser/version.rb
16
- test/feeds/books/nostarch.rss
17
- test/feeds/books/oreilly.feedburner.atom
18
- test/feeds/books/pragprog.rss
19
- test/feeds/byparker.json
20
- test/feeds/daringfireball.atom
21
- test/feeds/daringfireball.json
22
- test/feeds/googlegroups.atom
23
- test/feeds/googlegroups2.atom
24
- test/feeds/headius.atom
25
- test/feeds/inessential.json
26
- test/feeds/intertwingly.atom
27
- test/feeds/jsonfeed.json
28
- test/feeds/lambdatheultimate.rss
29
- test/feeds/learnenough.feedburner.atom
30
- test/feeds/news/nytimes-blogs-bits.rss
31
- test/feeds/news/nytimes-paul-krugman.rss
32
- test/feeds/news/nytimes-tech.rss
33
- test/feeds/news/nytimes-thomas-l-friedman.rss
34
- test/feeds/news/nytimes.rss
35
- test/feeds/news/washingtonpost-blogs-innovations.rss
36
- test/feeds/news/washingtonpost-politics.rss
37
- test/feeds/news/washingtonpost-world.rss
38
- test/feeds/ongoing.atom
39
- test/feeds/osm/blog.openstreetmap.rss
40
- test/feeds/osm/blogs.openstreetmap.rss
41
- test/feeds/osm/mapbox.rss
42
- test/feeds/railstutorial.feedburner.atom
43
- test/feeds/rubyflow.feedburner.rss
44
- test/feeds/rubymine.feedburner.rss
45
- test/feeds/rubyonrails.atom
46
- test/feeds/scripting.rss
47
- test/feeds/sitepoint.rss
48
- test/feeds/spec/atom/author.atom
49
- test/feeds/spec/atom/authors.atom
50
- test/feeds/spec/atom/categories.atom
51
- test/feeds/spec/json/example.json
52
- test/feeds/spec/json/microblog.json
53
- test/feeds/spec/json/tags.json
54
- test/feeds/spec/rss/author.rss
55
- test/feeds/spec/rss/categories.rss
56
- test/feeds/spec/rss/creator.rss
57
- test/feeds/xkcd.atom
58
- test/feeds/xkcd.rss
59
17
  test/helper.rb
60
- test/test_atom.rb
61
18
  test/test_atom_live.rb
62
- test/test_authors.rb
63
- test/test_books.rb
64
19
  test/test_dates.rb
65
- test/test_feeds.rb
66
- test/test_json.rb
67
- test/test_rss.rb
20
+ test/test_microformats.rb
68
21
  test/test_rss_live.rb
69
- test/test_tags.rb
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # feedparser
2
2
 
3
- feedparser gem - web feed parser and normalizer (Atom, RSS 2.0, JSON, etc.)
3
+ feedparser gem - web feed parser and normalizer (Atom, RSS 2.0, JSON Feed, HTML h-entry, etc.)
4
4
 
5
5
  * home :: [github.com/feedparser/feedparser](https://github.com/feedparser/feedparser)
6
6
  * bugs :: [github.com/feedparser/feedparser/issues](https://github.com/feedparser/feedparser/issues)
@@ -11,11 +11,19 @@ feedparser gem - web feed parser and normalizer (Atom, RSS 2.0, JSON, etc.)
11
11
 
12
12
  ## What's News?
13
13
 
14
- May/2017: Added support for reading feeds in the new [JSON Feed](https://jsonfeed.org) format in - surprise, surprise - JSON.
14
+ **June/2017**: Added support for reading feeds in HTML with Microformats incl.
15
+ [`h-entry`](http://microformats.org/wiki/h-entry),
16
+ [`h-feed`](http://microformats.org/wiki/h-feed) and others.
17
+
18
+ All feeds with test assertions for easy (re)use and browsing have moved
19
+ to their own repo, that is, [`/feeds`](https://github.com/feedparser/feeds).
20
+
21
+ **May/2017**: Added support for reading feeds in the new [JSON Feed](https://jsonfeed.org) format in - surprise, surprise - JSON.
22
+
15
23
 
16
24
  ## What's a Web Feed?
17
25
 
18
- See the [Awesome Feeds](https://github.com/statictimes/awesome-feeds) page ».
26
+ See the [Awesome Feeds](https://github.com/feedparser/awesome-feeds) page ».
19
27
 
20
28
 
21
29
 
@@ -62,7 +70,7 @@ RFC-822 date format e.g. Wed, 14 Jan 2015 19:48:57 +0100
62
70
  ISO-8601 date format e.g. 2015-01-11T09:30:16Z
63
71
 
64
72
 
65
- ```
73
+ ``` ruby
66
74
  class Feed
67
75
  attr_accessor :format # e.g. atom|rss 2.0|etc.
68
76
  attr_accessor :title # note: always plain vanilla text - if present html tags will get stripped and html entities unescaped
@@ -75,9 +83,9 @@ class Feed
75
83
  attr_accessor :updated # note: is lastBuildDate in RSS 2.0
76
84
  attr_accessor :published # note: is pubDate in RSS 2.0; not available in Atom
77
85
 
86
+ attr_accessor :authors
87
+ attr_accessor :tags
78
88
  attr_accessor :generator
79
- attr_accessor :generator_version # e.g. @version (atom)
80
- attr_accessor :generator_uri # e.g. @uri (atom) - use alias url/link ???
81
89
  end
82
90
  ```
83
91
 
@@ -121,7 +129,7 @@ Note: The content element will assume html content.
121
129
 
122
130
  Note: In plain vanilla RSS 2.0 there's only one `pubDate` for items, thus, it's not possible to differentiate between published and updated dates for items; note - the `item.pubDate` will get mapped to `item.updated`. To set the published date in RSS 2.0 use the Dublin Core module, e.g. `dc:created`.
123
131
 
124
- ```
132
+ ``` ruby
125
133
  class Item
126
134
  attr_accessor :title # note: always plain vanilla text - if present html tags will get stripped and html entities unescaped
127
135
  attr_accessor :url
@@ -158,7 +166,7 @@ end
158
166
 
159
167
  ### Read Feed Example
160
168
 
161
- ```
169
+ ``` ruby
162
170
  require 'open-uri'
163
171
  require 'feedparser'
164
172
 
@@ -190,7 +198,7 @@ puts feed.items[0].content
190
198
  or reading a feed in the new [JSON Feed](https://jsonfeed.org) format in - surprise, surprise - JSON;
191
199
  note: nothing changes :-)
192
200
 
193
- ```
201
+ ``` ruby
194
202
  txt = open( 'http://openfootball.github.io/feed.json' ).read
195
203
 
196
204
  feed = FeedParser::Parser.parse( txt )
@@ -216,6 +224,60 @@ puts feed.items[0].content_text
216
224
  ...
217
225
  ```
218
226
 
227
+ ### Microformats
228
+
229
+ Microformats let you mark up feeds and posts in HTML with
230
+ [`h-entry`](http://microformats.org/wiki/h-entry),
231
+ [`h-feed`](http://microformats.org/wiki/h-feed),
232
+ and friends.
233
+
234
+ Note: Microformats support in feedparser is optional.
235
+ Install and require the [microformats gem](https://github.com/indieweb/microformats-ruby) to read
236
+ feeds in HTML with Microformats.
237
+
238
+
239
+ ``` ruby
240
+
241
+ require 'microformats'
242
+
243
+ text =<<HTML
244
+ <article class="h-entry">
245
+ <h1 class="p-name">Microformats are amazing</h1>
246
+ <p>Published by
247
+ <a class="p-author h-card" href="http://example.com">W. Developer</a>
248
+ on <time class="dt-published" datetime="2013-06-13 12:00:00">13<sup>th</sup> June 2013</time>
249
+
250
+ <p class="p-summary">In which I extoll the virtues of using microformats.</p>
251
+
252
+ <div class="e-content">
253
+ <p>Blah blah blah</p>
254
+ </div>
255
+ </article>
256
+ HTML
257
+
258
+ feed = FeedParser::Parser.parse( text )
259
+
260
+ puts feed.format
261
+ # => "html"
262
+ puts feed.items.size
263
+ # => 1
264
+ puts feed.items[0].authors.size
265
+ # => 1
266
+ puts feed.items[0].content_html
267
+ # => "<p>Blah blah blah</p>"
268
+ puts feed.items[0].content_text
269
+ # => "Blah blah blah"
270
+ puts feed.items[0].title
271
+ # => "Microformats are amazing"
272
+ puts feed.items[0].summary
273
+ # => "In which I extoll the virtues of using microformats."
274
+ puts feed.items[0].published
275
+ # => 2013-06-13 12:00:00
276
+ puts feed.items[0].authors[0].name
277
+ # => "W. Developer"
278
+ ...
279
+ ```
280
+
219
281
  ## Samples
220
282
 
221
283
  ### Feed Reader
data/Rakefile CHANGED
@@ -5,7 +5,7 @@ Hoe.spec 'feedparser' do
5
5
 
6
6
  self.version = FeedParser::VERSION
7
7
 
8
- self.summary = 'feedparser - web feed parser and normalizer (RSS 2.0, Atom, JSON, etc.)'
8
+ self.summary = 'feedparser - web feed parser and normalizer (RSS 2.0, Atom, JSON Feed, HTML h-entry, etc.)'
9
9
  self.description = summary
10
10
 
11
11
  self.urls = ['https://github.com/feedparser/feedparser']
@@ -23,6 +23,8 @@ require 'feedparser/version' # let it always go first
23
23
  require 'feedparser/builder/atom'
24
24
  require 'feedparser/builder/rss'
25
25
  require 'feedparser/builder/json'
26
+ require 'feedparser/builder/microformats'
27
+
26
28
 
27
29
  require 'feedparser/feed'
28
30
  require 'feedparser/item'
@@ -0,0 +1,264 @@
1
+ # encoding: utf-8
2
+
3
+ module FeedParser
4
+
5
+
6
+ class HyFeedBuilder
7
+
8
+ include LogUtils::Logging
9
+
10
+
11
+ def self.build( hash )
12
+ feed = self.new( hash )
13
+ feed.to_feed
14
+ end
15
+
16
+ def initialize( hash )
17
+ @feed = build_feed( hash )
18
+ end
19
+
20
+ def to_feed
21
+ @feed
22
+ end
23
+
24
+
25
+ def build_feed( h )
26
+
27
+ b = HyBuilder.new( h ) ## convert hash to structs
28
+
29
+ ## use first feed - more really possible?
30
+ ## fix/todo: handle no feed too!!!
31
+ hy = b.feeds[0]
32
+
33
+ ## pp hy
34
+
35
+ feed = Feed.new
36
+ feed.format = 'html'
37
+
38
+ ### todo: add
39
+ ## - feed.title
40
+ ## - feed.url
41
+ ## - feed.feed_url
42
+ ## - feed.summary
43
+ ## - feed.authors
44
+ ## etc.
45
+
46
+ hy.entries.each do |entry|
47
+ feed.items << build_item( entry )
48
+ end
49
+
50
+ feed # return new feed
51
+ end # method build_feed
52
+
53
+
54
+ def build_author( hy )
55
+ author = Author.new
56
+
57
+ author.name = hy.name
58
+
59
+ ## todo - add:
60
+ ## author.url
61
+
62
+ author
63
+ end
64
+
65
+
66
+
67
+ def build_item( hy )
68
+ item = Item.new # Item.new
69
+
70
+ item.title = hy.name
71
+ item.url = hy.url
72
+ item.published_local = hy.published_local
73
+ item.published = hy.published
74
+
75
+ item.content_html = hy.content_html
76
+ item.content_text = hy.content_text
77
+ item.summary = hy.summary
78
+
79
+ ## check: how to add an id - auto-generate - why? why not??
80
+ ## item.id = h['id']
81
+
82
+ hy.authors.each do |author|
83
+ item.authors << build_author( author )
84
+ end
85
+
86
+ item
87
+ end # method build_item
88
+
89
+ end # class HyFeedBuilder
90
+
91
+
92
+
93
+ class HyFeed
94
+ attr_accessor :entries
95
+
96
+ def initialize
97
+ @entries = []
98
+ end
99
+ end # class HyFeed
100
+
101
+
102
+ class HyEntry
103
+ attr_accessor :name
104
+ attr_accessor :content
105
+ attr_accessor :content_text
106
+ attr_accessor :summary
107
+
108
+ attr_accessor :published # utc time
109
+ attr_accessor :published_local # local time (with timezone/offset)
110
+ attr_accessor :url
111
+
112
+ attr_accessor :authors # note: allow multiple authors
113
+
114
+ # note: title is an alias for name
115
+ alias :title :name
116
+ alias :title= :name=
117
+
118
+ # note: content_html is an alias for content
119
+ alias :content_html :content
120
+ alias :content_html= :content=
121
+
122
+ def initialize
123
+ @authors = []
124
+ end
125
+
126
+ end ## class HyEntry
127
+
128
+
129
+ class HyAuthor
130
+ attr_accessor :name
131
+ attr_accessor :url
132
+ end ## class HyAuthor
133
+
134
+
135
+
136
+
137
+ class HyBuilder
138
+
139
+ attr_reader :feeds
140
+
141
+ def initialize( hash )
142
+ @h = hash
143
+ @feeds = []
144
+ build
145
+
146
+ pp @feeds
147
+ end
148
+
149
+ def build
150
+
151
+ entries = []
152
+ @h['items'].each_with_index do |item_hash,i|
153
+ puts "item #{i+1}:"
154
+ pp item_hash
155
+
156
+ types = item_hash['type']
157
+ pp types
158
+ if types.include?( 'h-feed' )
159
+ @feeds << build_feed( item_hash )
160
+ elsif types.include?( 'h-entry' )
161
+ entries << build_entry( item_hash )
162
+ else
163
+ ## unknown type; skip for now
164
+ end
165
+ end
166
+
167
+ ## wrap all "loose" entries in a "dummy" h-entry feed
168
+ if entries.any?
169
+ feed = HyFeed.new
170
+ feed.entries = entries
171
+ @feeds << feed
172
+ end
173
+
174
+ end # method build
175
+
176
+ def build_feed( h )
177
+ puts " build_feed"
178
+
179
+ feed = HyFeed.new
180
+
181
+ h['children'].each_with_index do |item_hash,i|
182
+ puts "item #{i+1}:"
183
+ pp item_hash
184
+
185
+ types = item_hash['type']
186
+ pp types
187
+ if types.include?( 'h-entry' )
188
+ feed.entries << build_entry( item_hash )
189
+ else
190
+ ## unknown type; skip for now
191
+ end
192
+ end
193
+
194
+ feed
195
+ end ## method build_feed
196
+
197
+
198
+ def build_entry( h )
199
+ puts " build_entry"
200
+
201
+ entry = HyEntry.new
202
+
203
+ props = h['properties']
204
+ pp props
205
+
206
+ entry.name = props['name'].join( ' ') # check an example with more entries (how to join??)
207
+
208
+ if props['summary']
209
+ entry.summary = props['summary'].join( ' ' )
210
+ end
211
+
212
+ if props['content']
213
+ ## add up all value attribs in content
214
+ entry.content_text = props['content'].map { |h| h[:value] }.join( ' ' ).strip
215
+ ## add up all html attribs in content; plus strip leading n trailing whitespaces
216
+ entry.content = props['content'].map { |h| h[:html] }.join( ' ' ).strip
217
+ end
218
+
219
+
220
+ # get first field in array -- check if really ever possible more than one? what does it mean (many dates)???
221
+ ## todo: check if datetime is always utc (or local possible?)
222
+ url_str = props.fetch( 'url', [] )[0]
223
+ if url_str
224
+ entry.url = url_str
225
+ end
226
+
227
+ # get first field in array -- check if really ever possible more than one? what does it mean (many dates)???
228
+ ## todo: check if datetime is always utc (or local possible?)
229
+ published_str = props.fetch( 'published', [] )[0]
230
+ pp published_str
231
+ if published_str
232
+ ## entry.published = DateTime.iso8601( published_str )
233
+ entry.published_local = DateTime.parse( published_str )
234
+ entry.published = entry.published_local.utc
235
+ end
236
+
237
+ ## check for authors
238
+ if props['author']
239
+ props['author'].each do |author_hash|
240
+ pp author_hash
241
+ entry.authors << build_author( author_hash )
242
+ end
243
+ end
244
+
245
+ entry
246
+ end # method build_entry
247
+
248
+ def build_author( h )
249
+ puts " build_author"
250
+
251
+ author = HyAuthor.new
252
+
253
+ author.name = h['value']
254
+
255
+ ## todo/fix: -- note: for now skip possible embedded h-card
256
+ author
257
+ end # method build_author
258
+
259
+
260
+ end # class HyBuilder
261
+
262
+
263
+
264
+ end # module FeedParser