feedtools 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,11 @@
1
+ == FeedTools 0.2.2
2
+ * fixed http redirection bug
3
+ * fixed several documentation typos
4
+ * still more unit tests
5
+ * improved support for atom
6
+ * minor improvements to the database caching mechanism
7
+ * more complete support for rss elements
8
+ * major improvements to the handling of tags containing html content
1
9
  == FeedTools 0.2.1
2
10
  * fixed incorrect dependency on ActiveRecord 1.10.1
3
11
  * more unit tests
data/lib/feed_tools.rb CHANGED
@@ -25,7 +25,7 @@ FEED_TOOLS_ENV = ENV['FEED_TOOLS_ENV'] ||
25
25
  ENV['RAILS_ENV'] ||
26
26
  'production' # :nodoc:
27
27
 
28
- FEED_TOOLS_VERSION = "0.2.1"
28
+ FEED_TOOLS_VERSION = "0.2.2"
29
29
 
30
30
  $:.unshift(File.dirname(__FILE__))
31
31
  $:.unshift(File.dirname(__FILE__) + "/../../activerecord/lib")
@@ -131,6 +131,19 @@ module FeedTools
131
131
  end
132
132
  return nil
133
133
  end
134
+
135
+ # Returns true if a connection to the database has been established and the
136
+ # required table structure is in place.
137
+ def DatabaseFeedCache.connected?
138
+ begin
139
+ ActiveRecord::Base.connection
140
+ return false if ActiveRecord::Base.configurations.nil?
141
+ return false unless DatabaseFeedCache.table_exists?
142
+ rescue => error
143
+ return false
144
+ end
145
+ return true
146
+ end
134
147
 
135
148
  # True if the appropriate database table already exists
136
149
  def DatabaseFeedCache.table_exists?
@@ -258,12 +271,25 @@ module FeedTools
258
271
  # find_by_id
259
272
  # find_by_url
260
273
  # initialize_cache
274
+ # connected?
261
275
  def FeedTools.feed_cache=(new_feed_cache)
262
276
  # TODO: ensure that the feed cache class actually does those things.
263
277
  # ==================================================================
264
278
  @feed_cache = new_feed_cache
265
279
  end
266
280
 
281
+ # Returns true if FeedTools.feed_cache is not nil and a connection with
282
+ # the cache has been successfully established. Also returns false if an
283
+ # error is raised while trying to determine the status of the cache.
284
+ def FeedTools.feed_cache_connected?
285
+ begin
286
+ return false if FeedTools.feed_cache.nil?
287
+ return FeedTools.feed_cache.connected?
288
+ rescue
289
+ return false
290
+ end
291
+ end
292
+
267
293
  # Returns the currently used user agent string.
268
294
  def FeedTools.user_agent
269
295
  return @user_agent
@@ -455,6 +481,25 @@ module FeedTools
455
481
  return true
456
482
  end
457
483
 
484
+ # Escapes all html entities
485
+ def FeedTools.escape_entities(html)
486
+ escaped_html = CGI.escapeHTML(html)
487
+ unescaped_html.gsub!(/'/, "&apos;")
488
+ unescaped_html.gsub!(/"/, "&quot;")
489
+ return escaped_html
490
+ end
491
+
492
+ # Unescapes all html entities
493
+ def FeedTools.unescape_entities(html)
494
+ unescaped_html = html
495
+ unescaped_html.gsub!(/&#x26;/, "&amp;")
496
+ unescaped_html.gsub!(/&#38;/, "&amp;")
497
+ unescaped_html = CGI.unescapeHTML(unescaped_html)
498
+ unescaped_html.gsub!(/&apos;/, "'")
499
+ unescaped_html.gsub!(/&quot;/, "\"")
500
+ return unescaped_html
501
+ end
502
+
458
503
  # Removes all html tags from the html formatted text.
459
504
  def FeedTools.strip_html(html)
460
505
  # TODO: do this properly
@@ -467,6 +512,7 @@ module FeedTools
467
512
  def FeedTools.tidy_html(html)
468
513
  if FeedTools.tidy_enabled?
469
514
  is_fragment = true
515
+ html.gsub!(/<!'/, "&lt;!'")
470
516
  if (html.strip =~ /<html>(.|\n)*<body>/) != nil ||
471
517
  (html.strip =~ /<\/body>(.|\n)*<\/html>$/) != nil
472
518
  is_fragment = false
@@ -484,13 +530,15 @@ module FeedTools
484
530
  xml
485
531
  end
486
532
  if is_fragment
487
- # Tidy puts <html>...<body>[our html]</body>...</html> in.
533
+ # Tidy sticks <html>...<body>[our html]</body>...</html> in.
488
534
  # We don't want this.
489
535
  tidy_html.strip!
490
536
  tidy_html.gsub!(/^<html>(.|\n)*<body>/, "")
491
537
  tidy_html.gsub!(/<\/body>(.|\n)*<\/html>$/, "")
492
538
  tidy_html.strip!
493
539
  end
540
+ tidy_html.gsub!(/&#x26;/, "&amp;")
541
+ tidy_html.gsub!(/&#38;/, "&amp;")
494
542
  else
495
543
  tidy_html = html
496
544
  end
@@ -502,7 +550,7 @@ module FeedTools
502
550
  # be escaped. If mode is set to :strip, dangerous and unknown
503
551
  # elements and all children will be removed entirely.
504
552
  # Dangerous or unknown attributes are always removed.
505
- def FeedTools.sanitize_html(html, mode=:escape)
553
+ def FeedTools.sanitize_html(html, mode=:strip)
506
554
 
507
555
  # Lists borrowed from Mark Pilgrim's feedparser
508
556
  acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
@@ -527,18 +575,14 @@ module FeedTools
527
575
  'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
528
576
  'type', 'usemap', 'valign', 'value', 'vspace', 'width']
529
577
 
530
- # Stupid hack to pass this unit test:
531
- # http://feedparser.org/tests/wellformed/rss/
532
- # item_description_not_a_doctype.xml
533
- html.gsub!(/<!'/, "&lt;!'")
534
-
535
- # The closer we are to proper xhtml, the more accurate the
536
- # sanitization will be.
537
- html = FeedTools.tidy_html(html)
538
-
578
+ # Replace with appropriate named entities
579
+ html.gsub!(/&#x26;/, "&amp;")
580
+ html.gsub!(/&#38;/, "&amp;")
581
+ html.gsub!(/&lt;!'/, "&amp;lt;!'")
582
+
539
583
  # Hackity hack. But it works, and it seems plenty fast enough.
540
584
  html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
541
-
585
+
542
586
  sanitize_node = lambda do |html_node|
543
587
  if html_node.respond_to? :children
544
588
  for child in html_node.children
@@ -564,13 +608,83 @@ module FeedTools
564
608
  html_node
565
609
  end
566
610
  sanitize_node.call(html_doc.root)
567
- return html_doc.root.inner_xml
611
+ html = html_doc.root.inner_xml
612
+ return html
568
613
  end
569
614
 
570
615
  class Feed
571
616
  include REXML
572
617
  include AttributeDictionary
573
618
 
619
+ # Represents a feed/feed item's category
620
+ class Category
621
+ # The category term value
622
+ attr_accessor :term
623
+ # The categorization scheme
624
+ attr_accessor :scheme
625
+ # A human-readable description of the category
626
+ attr_accessor :label
627
+
628
+ # Relays any unknown methods to the term so that you can treat the
629
+ # category object as a string
630
+ def method_missing(msg, *params)
631
+ self.term.send(msg, params)
632
+ end
633
+
634
+ # Relays the to_s method to the term field
635
+ def to_s
636
+ self.term.to_s
637
+ end
638
+
639
+ # Relays the inspect method to the term field
640
+ def inspect
641
+ self.term.inspect
642
+ end
643
+ end
644
+
645
+ # Represents a feed/feed item's author
646
+ class Author
647
+ # The author's real name
648
+ attr_accessor :name
649
+ # The author's email address
650
+ attr_accessor :email
651
+ # The url of the author's homepage
652
+ attr_accessor :url
653
+ # The raw value of the author tag if present
654
+ attr_accessor :raw
655
+
656
+ # Relays any unknown methods to the name so that you can treat the
657
+ # author object as a string
658
+ def method_missing(msg, *params)
659
+ self.name.send(msg, params)
660
+ end
661
+
662
+ # Relays the to_s method to the name field
663
+ def to_s
664
+ self.name.to_s
665
+ end
666
+
667
+ # Relays the inspect method to the name field
668
+ def inspect
669
+ self.name.inspect
670
+ end
671
+ end
672
+
673
+ # Represents a feed's cloud.
674
+ class Cloud
675
+ # The domain of the cloud.
676
+ attr_accessor :domain
677
+ # The path for the cloud.
678
+ attr_accessor :path
679
+ # The port the cloud is listening on.
680
+ attr_accessor :port
681
+ # The web services protocol the cloud uses.
682
+ # Possible values are either "xml-rpc" or "soap".
683
+ attr_accessor :protocol
684
+ # The procedure to use to request notification.
685
+ attr_accessor :register_procedure
686
+ end
687
+
574
688
  # Loads the feed specified by the url, pulling the data from the cache if it hasn't expired.
575
689
  def Feed.open(url)
576
690
  # clean up the url
@@ -671,14 +785,14 @@ module FeedTools
671
785
 
672
786
  Net::HTTP.start(feed_uri.host, (feed_uri.port or 80)) do |http|
673
787
  response = http.request_get(feed_uri.path, http_headers)
674
-
788
+
675
789
  case response
676
790
  when Net::HTTPSuccess
677
791
  # We've reached the final destination, process all previous
678
792
  # redirections, and see if we need to update the url.
679
793
  for redirected_response in response_chain
680
794
  if redirected_response.last.code.to_i == 301
681
- self.url = redirected_response.first
795
+ self.url = redirected_response.last['location']
682
796
  else
683
797
  # Jump out as soon as we hit anything that isn't a
684
798
  # permanently moved redirection.
@@ -690,7 +804,7 @@ module FeedTools
690
804
  if response.code.to_i == 304
691
805
  response.error!
692
806
  else
693
- if response['Location'].nil?
807
+ if response['location'].nil?
694
808
  raise FeedAccessError,
695
809
  "No location to redirect to supplied: " + response.code
696
810
  end
@@ -913,6 +1027,30 @@ module FeedTools
913
1027
  @cache_object = new_cache_object
914
1028
  end
915
1029
 
1030
+ # Returns the type of feed
1031
+ # Possible values:
1032
+ # "rss", "atom", "cdf", "!okay/news"
1033
+ def feed_type
1034
+ if @feed_type.nil?
1035
+ case self.root_node.name.downcase
1036
+ when "feed"
1037
+ @feed_type = "atom"
1038
+ when "rdf:rdf"
1039
+ @feed_type = "rss"
1040
+ when "rss"
1041
+ @feed_type = "rss"
1042
+ when "channel"
1043
+ @feed_type = "cdf"
1044
+ end
1045
+ end
1046
+ return @feed_type
1047
+ end
1048
+
1049
+ # Sets the default feed type
1050
+ def feed_type=(new_feed_type)
1051
+ @feed_type = new_feed_type
1052
+ end
1053
+
916
1054
  # Returns the feed's unique id
917
1055
  def id
918
1056
  if @id.nil?
@@ -948,24 +1086,30 @@ module FeedTools
948
1086
  # Returns the feed title
949
1087
  def title
950
1088
  if @title.nil?
951
- if XPath.first(channel_node, "title/@type").to_s == "xhtml" ||
952
- XPath.first(channel_node, "title/@mode").to_s == "xhtml"
1089
+ repair_entities = false
1090
+ if XPath.first(channel_node, "title/@type").to_s == "xhtml" ||
1091
+ XPath.first(channel_node, "title/@mode").to_s == "xhtml" ||
1092
+ XPath.first(channel_node, "title/@type").to_s == "xml" ||
1093
+ XPath.first(channel_node, "title/@mode").to_s == "xml" ||
1094
+ XPath.first(channel_node, "title/@type").to_s ==
1095
+ "application/xhtml+xml"
953
1096
  @title = XPath.first(channel_node, "title").inner_xml
954
1097
  elsif XPath.first(channel_node, "title/@type").to_s == "escaped" ||
955
1098
  XPath.first(channel_node, "title/@mode").to_s == "escaped"
956
- @title = CGI.unescapeHTML(
1099
+ @title = FeedTools.unescape_entities(
957
1100
  XPath.first(channel_node, "title/text()").to_s)
958
1101
  else
959
- @title = CGI.unescapeHTML(
960
- XPath.first(channel_node, "title/text()").to_s)
1102
+ title_node = XPath.first(channel_node, "title")
1103
+ @title = title_node.inner_xml
1104
+ repair_entities = true
961
1105
  end
962
1106
  unless @title.nil?
963
- @title = CGI.unescapeHTML(FeedTools.sanitize_html(@title, :strip))
964
- end
965
- if @title != "" && !(@title.nil?)
966
- @title = FeedTools.strip_html(@title).strip
1107
+ @title = FeedTools.sanitize_html(@title, :strip)
1108
+ @title = FeedTools.unescape_entities(@title) if repair_entities
1109
+ @title = FeedTools.tidy_html(@title)
967
1110
  end
968
1111
  @title.gsub!(/\n/, " ")
1112
+ @title.strip!
969
1113
  @title = nil if @title == ""
970
1114
  self.cache_object.title = @title unless self.cache_object.nil?
971
1115
  end
@@ -981,57 +1125,57 @@ module FeedTools
981
1125
  # Returns the feed description
982
1126
  def description
983
1127
  if @description.nil?
984
- # get the feed description from the xml document
985
- @description = XPath.first(channel_node, "description/text()").to_s
986
- if @description != ""
987
- if XPath.first(channel_node, "description/@encoding").to_s != ""
988
- @description = "[Embedded data objects are not supported.]"
1128
+ repair_entities = false
1129
+ description_node = XPath.first(channel_node, "description")
1130
+ if description_node.nil?
1131
+ description_node = XPath.first(channel_node, "tagline")
1132
+ end
1133
+ if description_node.nil?
1134
+ description_node = XPath.first(channel_node, "subtitle")
1135
+ end
1136
+ if description_node.nil?
1137
+ description_node = XPath.first(channel_node, "summary")
1138
+ end
1139
+ if description_node.nil?
1140
+ description_node = XPath.first(channel_node, "abstract")
1141
+ end
1142
+ if description_node.nil?
1143
+ description_node = XPath.first(channel_node, "info")
1144
+ end
1145
+ if description_node.nil?
1146
+ description_node = XPath.first(channel_node, "content:encoded")
1147
+ @bozo = true unless description_node.nil?
1148
+ end
1149
+ if description_node.nil?
1150
+ description_node = XPath.first(channel_node, "content")
1151
+ @bozo = true unless description_node.nil?
1152
+ end
1153
+ if description_node.nil?
1154
+ description_node = XPath.first(channel_node, "xhtml:body")
1155
+ @bozo = true unless description_node.nil?
1156
+ end
1157
+ if description_node.nil?
1158
+ description_node = XPath.first(channel_node, "body")
1159
+ @bozo = true unless description_node.nil?
1160
+ end
1161
+ unless description_node.nil?
1162
+ if XPath.first(description_node, "@encoding").to_s != ""
1163
+ @description =
1164
+ "[Embedded data objects are not currently supported.]"
1165
+ elsif XPath.first(description_node, "@type").to_s == "xhtml" ||
1166
+ XPath.first(description_node, "@mode").to_s == "xhtml" ||
1167
+ XPath.first(description_node, "@type").to_s == "xml" ||
1168
+ XPath.first(description_node, "@mode").to_s == "xml" ||
1169
+ XPath.first(description_node, "@type").to_s ==
1170
+ "application/xhtml+xml"
1171
+ @description = description_node.inner_xml
1172
+ elsif XPath.first(description_node, "@type").to_s == "escaped" ||
1173
+ XPath.first(description_node, "@mode").to_s == "escaped"
1174
+ @description = FeedTools.unescape_entities(
1175
+ description_node.inner_xml)
989
1176
  else
990
- @description = CGI.unescapeHTML(description)
991
- end
992
- end
993
- if @description == ""
994
- @description = XPath.first(channel_node, "subtitle/text()").to_s
995
- if @description != "" &&
996
- XPath.first(channel_node, "subtitle/@mode").to_s == "escaped"
997
- @description = CGI.unescapeHTML(description)
998
- end
999
- end
1000
- if @description == ""
1001
- @description = XPath.first(channel_node, "tagline/text()").to_s
1002
- if @description != "" &&
1003
- XPath.first(channel_node, "tagline/@mode").to_s == "escaped"
1004
- @description = CGI.unescapeHTML(description)
1005
- end
1006
- end
1007
- if @description == "" && XPath.first(channel_node, "tagline") == nil
1008
- @description = XPath.first(channel_node, "info/text()").to_s
1009
- if @description != "" &&
1010
- XPath.first(channel_node, "info/@mode").to_s == "escaped"
1011
- @description = CGI.unescapeHTML(description)
1012
- end
1013
- end
1014
- if @description == ""
1015
- @description = CGI.unescapeHTML(
1016
- XPath.first(channel_node, "abstract/text()").to_s)
1017
- end
1018
- if @description == ""
1019
- @description = CGI.unescapeHTML(
1020
- XPath.first(channel_node, "summary/text()").to_s)
1021
- end
1022
- if @description == ""
1023
- # I don't think this is valid for anyone to do, but this is probably
1024
- # what they meant if they do it.
1025
- @description = CGI.unescapeHTML(
1026
- XPath.first(channel_node, "content:encoded/text()").to_s)
1027
- if @description != ""
1028
- @bozo = true
1029
- end
1030
- end
1031
- if @description == ""
1032
- begin
1033
- @description = XPath.first(channel_node, "description").inner_xml
1034
- rescue
1177
+ @description = description_node.inner_xml
1178
+ repair_entities = true
1035
1179
  end
1036
1180
  end
1037
1181
  if @description == ""
@@ -1043,13 +1187,12 @@ module FeedTools
1043
1187
  @description = "" if @description.nil?
1044
1188
  end
1045
1189
 
1046
- @description =
1047
- FeedTools.sanitize_html(@description) unless @description.nil?
1048
- # If it started with a bunch of divs, hack them right off. We can put
1049
- # them back later if they're needed.
1050
- @description.gsub!(/^(<div[^>]*>)*/, "")
1051
- @description.gsub!(/(<\/div>)*$/, "")
1052
-
1190
+ unless @description.nil?
1191
+ @description = FeedTools.sanitize_html(@description, :strip)
1192
+ @description = FeedTools.unescape_entities(@description) if repair_entities
1193
+ @description = FeedTools.tidy_html(@description)
1194
+ end
1195
+
1053
1196
  @description.gsub!(/\n/, " ") if @description.size < 80
1054
1197
  @description = @description.strip unless @description.nil?
1055
1198
  @description = nil if @description == ""
@@ -1065,7 +1208,7 @@ module FeedTools
1065
1208
  # Returns the contents of the itunes:summary element
1066
1209
  def itunes_summary
1067
1210
  if @itunes_summary.nil?
1068
- @itunes_summary = CGI.unescapeHTML(XPath.first(root_node,
1211
+ @itunes_summary = FeedTools.unescape_entities(XPath.first(root_node,
1069
1212
  "itunes:summary/text()").to_s)
1070
1213
  if @itunes_summary == ""
1071
1214
  @itunes_summary = nil
@@ -1084,7 +1227,7 @@ module FeedTools
1084
1227
  # Returns the contents of the itunes:subtitle element
1085
1228
  def itunes_subtitle
1086
1229
  if @itunes_subtitle.nil?
1087
- @itunes_subtitle = CGI.unescapeHTML(XPath.first(root_node,
1230
+ @itunes_subtitle = FeedTools.unescape_entities(XPath.first(root_node,
1088
1231
  "itunes:subtitle/text()").to_s)
1089
1232
  if @itunes_subtitle == ""
1090
1233
  @itunes_subtitle = nil
@@ -1211,6 +1354,197 @@ module FeedTools
1211
1354
  return @icon_link
1212
1355
  end
1213
1356
 
1357
+ # Returns the feed author
1358
+ def author
1359
+ if @author.nil?
1360
+ @author = FeedTools::Feed::Author.new
1361
+
1362
+ # Set the author name
1363
+ @author.name = FeedTools.unescape_entities(
1364
+ XPath.first(channel_node, "author/name/text()").to_s)
1365
+
1366
+ @author.raw = FeedTools.unescape_entities(
1367
+ XPath.first(channel_node, "author/text()").to_s)
1368
+ if @author.raw == ""
1369
+ @author.raw = FeedTools.unescape_entities(
1370
+ XPath.first(channel_node, "dc:creator/text()").to_s)
1371
+ end
1372
+ if @author.raw == ""
1373
+ @author.raw = FeedTools.unescape_entities(
1374
+ XPath.first(channel_node, "dc:author/text()").to_s)
1375
+ end
1376
+ unless @author.raw == ""
1377
+ raw_scan = @author.raw.scan(
1378
+ /(.*)\((\b[A-Z0-9._%-\+]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b)\)/i)
1379
+ if raw_scan.nil? || raw_scan.size == 0
1380
+ raw_scan = @author.raw.scan(
1381
+ /(\b[A-Z0-9._%-\+]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b)\s*\((.*)\)/i)
1382
+ author_raw_pair = raw_scan.first.reverse unless raw_scan.size == 0
1383
+ else
1384
+ author_raw_pair = raw_scan.first
1385
+ end
1386
+ if raw_scan.nil? || raw_scan.size == 0
1387
+ email_scan = @author.raw.scan(
1388
+ /\b[A-Z0-9._%-\+]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b/i)
1389
+ if email_scan != nil && email_scan.size > 0
1390
+ @author.email = email_scan.first.strip
1391
+ end
1392
+ end
1393
+ unless author_raw_pair.nil? || author_raw_pair.size == 0
1394
+ @author.name = author_raw_pair.first.strip
1395
+ @author.email = author_raw_pair.last.strip
1396
+ else
1397
+ unless @author.raw.include?("@")
1398
+ # We can be reasonably sure we are looking at something
1399
+ # that the creator didn't intend to contain an email address if
1400
+ # it got through the preceding regexes and it doesn't
1401
+ # contain the tell-tale '@' symbol.
1402
+ @author.name = @author.raw
1403
+ end
1404
+ end
1405
+ end
1406
+
1407
+ @author.name = nil if @author.name == ""
1408
+ @author.raw = nil if @author.raw == ""
1409
+
1410
+ # Set the author email
1411
+ if @author.email == ""
1412
+ @author.email = FeedTools.unescape_entities(
1413
+ XPath.first(channel_node, "author/email/text()").to_s)
1414
+ end
1415
+ @author.email = nil if @author.email == ""
1416
+
1417
+ # Set the author url
1418
+ @author.url = FeedTools.unescape_entities(
1419
+ XPath.first(channel_node, "author/url/text()").to_s)
1420
+ @author.url = nil if @author.url == ""
1421
+
1422
+ # Fallback on the itunes module if we didn't find an author name
1423
+ begin
1424
+ @author.name = self.itunes_author if @author.name.nil?
1425
+ rescue
1426
+ @author.name = nil
1427
+ end
1428
+ end
1429
+ return @author
1430
+ end
1431
+
1432
+ # Sets the feed author
1433
+ def author=(new_author)
1434
+ if new_author.respond_to?(:name) &&
1435
+ new_author.respond_to?(:email) &&
1436
+ new_author.respond_to?(:url)
1437
+ # It's a complete author object, just set it.
1438
+ @author = new_author
1439
+ else
1440
+ # We're not looking at an author object, this is probably a string,
1441
+ # default to setting the author's name.
1442
+ if @author.nil?
1443
+ @author = FeedTools::Feed::Author.new
1444
+ end
1445
+ @author.name = new_author
1446
+ end
1447
+ end
1448
+
1449
+ # Returns the feed publisher
1450
+ def publisher
1451
+ if @publisher.nil?
1452
+ @publisher = FeedTools::Feed::Author.new
1453
+
1454
+ # Set the publisher name
1455
+ @publisher.raw = FeedTools.unescape_entities(
1456
+ XPath.first(channel_node, "dc:publisher/text()").to_s)
1457
+ unless @publisher.raw == ""
1458
+ raw_scan = @publisher.raw.scan(
1459
+ /(.*)\((\b[A-Z0-9._%-\+]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b)\)/i)
1460
+ if raw_scan.nil? || raw_scan.size == 0
1461
+ raw_scan = @publisher.raw.scan(
1462
+ /(\b[A-Z0-9._%-\+]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b)\s*\((.*)\)/i)
1463
+ unless raw_scan.size == 0
1464
+ publisher_raw_pair = raw_scan.first.reverse
1465
+ end
1466
+ else
1467
+ publisher_raw_pair = raw_scan.first
1468
+ end
1469
+ if raw_scan.nil? || raw_scan.size == 0
1470
+ email_scan = @publisher.raw.scan(
1471
+ /\b[A-Z0-9._%-\+]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b/i)
1472
+ if email_scan != nil && email_scan.size > 0
1473
+ @publisher.email = email_scan.first.strip
1474
+ end
1475
+ end
1476
+ unless publisher_raw_pair.nil? || publisher_raw_pair.size == 0
1477
+ @publisher.name = publisher_raw_pair.first.strip
1478
+ @publisher.email = publisher_raw_pair.last.strip
1479
+ else
1480
+ unless @publisher.raw.include?("@")
1481
+ # We can be reasonably sure we are looking at something
1482
+ # that the creator didn't intend to contain an email address if
1483
+ # it got through the preceding regexes and it doesn't
1484
+ # contain the tell-tale '@' symbol.
1485
+ @publisher.name = @publisher.raw
1486
+ end
1487
+ end
1488
+ end
1489
+
1490
+ @publisher.name = nil if @publisher.name == ""
1491
+ @publisher.raw = nil if @publisher.raw == ""
1492
+ @publisher.email = nil if @publisher.email == ""
1493
+ @publisher.url = nil if @publisher.url == ""
1494
+ end
1495
+ return @publisher
1496
+ end
1497
+
1498
+ # Sets the feed publisher
1499
+ def publisher=(new_publisher)
1500
+ if new_publisher.respond_to?(:name) &&
1501
+ new_publisher.respond_to?(:email) &&
1502
+ new_publisher.respond_to?(:url)
1503
+ # It's a complete Author object, just set it.
1504
+ @publisher = new_publisher
1505
+ else
1506
+ # We're not looking at an Author object, this is probably a string,
1507
+ # default to setting the publisher's name.
1508
+ if @publisher.nil?
1509
+ @publisher = FeedTools::Feed::Author.new
1510
+ end
1511
+ @publisher.name = new_publisher
1512
+ end
1513
+ end
1514
+
1515
+ # Returns the contents of the itunes:author element
1516
+ #
1517
+ # Returns any incorrectly placed channel-level itunes:author
1518
+ # elements. They're actually amazingly common. People don't read specs.
1519
+ # There is no setter for this, since this is a "bozo" attribute.
1520
+ def itunes_author
1521
+ if @itunes_author.nil?
1522
+ @itunes_author = FeedTools.unescape_entities(XPath.first(channel_node,
1523
+ "itunes:author/text()").to_s)
1524
+ @itunes_author = nil if @itunes_author == ""
1525
+ @bozo = true unless @itunes_author.nil?
1526
+ end
1527
+ return @itunes_author
1528
+ end
1529
+
1530
+ # Returns the feed's copyright information
1531
+ def copyright
1532
+ if @copyright.nil?
1533
+ @copyright = XPath.first(channel_node, "copyright/text()").to_s
1534
+ if @copyright == ""
1535
+ @copyright = XPath.first(channel_node, "dc:rights/text()").to_s
1536
+ end
1537
+ @copyright = FeedTools.sanitize_html(@copyright, :strip)
1538
+ @copyright = nil if @copyright == ""
1539
+ end
1540
+ return @copyright
1541
+ end
1542
+
1543
+ # Sets the feed's copyright information
1544
+ def copyright=(new_copyright)
1545
+ @copyright = new_copyright
1546
+ end
1547
+
1214
1548
  # Returns the number of seconds before the feed should expire
1215
1549
  def time_to_live
1216
1550
  if @time_to_live.nil?
@@ -1275,6 +1609,33 @@ module FeedTools
1275
1609
  @time_to_live = 1 if @time_to_live < 1
1276
1610
  end
1277
1611
 
1612
+ # Returns the feed's cloud
1613
+ def cloud
1614
+ if @cloud.nil?
1615
+ @cloud = FeedTools::Feed::Cloud.new
1616
+ @cloud.domain = XPath.first(channel_node, "cloud/@domain").to_s
1617
+ @cloud.port = XPath.first(channel_node, "cloud/@port").to_s
1618
+ @cloud.path = XPath.first(channel_node, "cloud/@path").to_s
1619
+ @cloud.register_procedure =
1620
+ XPath.first(channel_node, "cloud/@registerProcedure").to_s
1621
+ @cloud.protocol =
1622
+ XPath.first(channel_node, "cloud/@protocol").to_s.downcase
1623
+ @cloud.domain = nil if @cloud.domain == ""
1624
+ @cloud.port = nil if @cloud.port == ""
1625
+ @cloud.port = @cloud.port.to_i unless @cloud.port.nil?
1626
+ @cloud.port = nil if @cloud.port == 0
1627
+ @cloud.path = nil if @cloud.path == ""
1628
+ @cloud.register_procedure = nil if @cloud.register_procedure == ""
1629
+ @cloud.protocol = nil if @cloud.protocol == ""
1630
+ end
1631
+ return @cloud
1632
+ end
1633
+
1634
+ # Sets the feed's cloud
1635
+ def cloud=(new_cloud)
1636
+ @cloud = new_cloud
1637
+ end
1638
+
1278
1639
  # Returns the feed generator
1279
1640
  def generator
1280
1641
  if @generator.nil?
@@ -1454,7 +1815,8 @@ module FeedTools
1454
1815
  end
1455
1816
 
1456
1817
  # Generates xml based on the content of the feed
1457
- def build_xml(feed_type="rss", version=0.0, xml_builder=Builder::XmlMarkup.new(:indent => 2))
1818
+ def build_xml(feed_type=(self.feed_type or "rss"), version=0.0,
1819
+ xml_builder=Builder::XmlMarkup.new(:indent => 2))
1458
1820
  if feed_type == "rss" && version == 0.0
1459
1821
  version = 1.0
1460
1822
  elsif feed_type == "atom" && version == 0.0
@@ -1801,22 +2163,47 @@ module FeedTools
1801
2163
  return @root_node
1802
2164
  end
1803
2165
 
2166
+ # Returns the feed item's unique id
2167
+ def id
2168
+ if @id.nil?
2169
+ @id = XPath.first(root_node, "id/text()").to_s
2170
+ if @id == ""
2171
+ @id = XPath.first(root_node, "guid/text()").to_s
2172
+ end
2173
+ @id = nil if @id == ""
2174
+ end
2175
+ return @id
2176
+ end
2177
+
2178
+ # Sets the feed item's unique id
2179
+ def id=(new_id)
2180
+ @id = new_id
2181
+ end
2182
+
1804
2183
  # Returns the feed item title
1805
2184
  def title
1806
2185
  if @title.nil?
2186
+ repair_entities = false
1807
2187
  if XPath.first(root_node, "title/@type").to_s == "xhtml" ||
1808
- XPath.first(root_node, "title/@mode").to_s == "xhtml"
2188
+ XPath.first(root_node, "title/@mode").to_s == "xhtml" ||
2189
+ XPath.first(root_node, "title/@type").to_s == "xml" ||
2190
+ XPath.first(root_node, "title/@mode").to_s == "xml" ||
2191
+ XPath.first(root_node, "title/@type").to_s ==
2192
+ "application/xhtml+xml"
1809
2193
  @title = XPath.first(root_node, "title").inner_xml
1810
2194
  elsif XPath.first(root_node, "title/@type").to_s == "escaped" ||
1811
2195
  XPath.first(root_node, "title/@mode").to_s == "escaped"
1812
- @title = CGI.unescapeHTML(
2196
+ @title = FeedTools.unescape_entities(
1813
2197
  XPath.first(root_node, "title/text()").to_s)
1814
2198
  else
1815
- @title = CGI.unescapeHTML(
1816
- XPath.first(root_node, "title/text()").to_s)
2199
+ title_node = XPath.first(root_node, "title")
2200
+ @title = title_node.inner_xml
2201
+ repair_entities = true
1817
2202
  end
1818
2203
  unless @title.nil?
1819
- @title = CGI.unescapeHTML(FeedTools.sanitize_html(@title, :strip))
2204
+ @title = FeedTools.sanitize_html(@title, :strip)
2205
+ @title = FeedTools.unescape_entities(@title) if repair_entities
2206
+ @title = FeedTools.tidy_html(@title)
1820
2207
  end
1821
2208
  if @title != ""
1822
2209
  # Some blogging tools include the number of comments in a post
@@ -1826,10 +2213,10 @@ module FeedTools
1826
2213
  #
1827
2214
  # If for some incredibly weird reason you need the actual
1828
2215
  # unstripped title, just use find_node("title/text()").to_s
1829
- @title = FeedTools.strip_html(
1830
- @title.strip.gsub(/\[\d*\]$/, "")).strip
1831
- @title.gsub!(/\n/, " ")
2216
+ @title = @title.strip.gsub(/\[\d*\]$/, "").strip
1832
2217
  end
2218
+ @title.gsub!(/\n/, " ")
2219
+ @title.strip!
1833
2220
  @title = nil if @title == ""
1834
2221
  end
1835
2222
  return @title
@@ -1843,53 +2230,54 @@ module FeedTools
1843
2230
  # Returns the feed item description
1844
2231
  def description
1845
2232
  if @description.nil?
1846
- # get the item content
1847
- @description = ""
1848
- body_node = XPath.first(root_node, "xhtml:body")
1849
- if body_node == nil
1850
- body_node = XPath.first(root_node, "body")
1851
- end
1852
- if body_node != nil
1853
- @description = body_node.inner_xml
1854
- end
1855
- if @description == ""
1856
- @description =
1857
- CGI.unescapeHTML(XPath.first(root_node, "content:encoded/text()").to_s)
1858
- end
1859
- if @description == ""
1860
- begin
1861
- @description = XPath.first(root_node, "description").cdatas.first.to_s
1862
- rescue
1863
- @description = ""
1864
- end
1865
- if @description == ""
1866
- @description = XPath.first(root_node, "description/text()").to_s
1867
- end
1868
- if @description != ""
1869
- if XPath.first(root_node, "description/@encoding").to_s != ""
1870
- # Not supported... yet.
1871
- @description = "[Embedded data objects are not supported.]"
1872
- else
1873
- @description = CGI.unescapeHTML(@description)
1874
- end
1875
- end
1876
- end
1877
- if @description == ""
1878
- @description = XPath.first(root_node, "content/text()").to_s
1879
- if @description != "" &&
1880
- (XPath.first(root_node, "content/@mode").to_s == "escaped" ||
1881
- XPath.first(root_node, "content/@type").to_s == "escaped")
1882
- @description = CGI.unescapeHTML(@description)
1883
- end
1884
- if XPath.first(root_node, "content/@mode").to_s == "xhtml" ||
1885
- XPath.first(root_node, "content/@type").to_s == "xhtml"
1886
- @description = XPath.first(root_node, "content").inner_xml
1887
- end
1888
- end
1889
- if @description == ""
1890
- begin
1891
- @description = XPath.first(root_node, "description").inner_xml
1892
- rescue
2233
+ repair_entities = false
2234
+ description_node = XPath.first(root_node, "description")
2235
+ if description_node.nil?
2236
+ description_node = XPath.first(root_node, "xhtml:body")
2237
+ end
2238
+ if description_node.nil?
2239
+ description_node = XPath.first(root_node, "body")
2240
+ end
2241
+ if description_node.nil?
2242
+ description_node = XPath.first(root_node, "tagline")
2243
+ end
2244
+ if description_node.nil?
2245
+ description_node = XPath.first(root_node, "subtitle")
2246
+ end
2247
+ if description_node.nil?
2248
+ description_node = XPath.first(root_node, "summary")
2249
+ end
2250
+ if description_node.nil?
2251
+ description_node = XPath.first(root_node, "abstract")
2252
+ end
2253
+ if description_node.nil?
2254
+ description_node = XPath.first(root_node, "content:encoded")
2255
+ end
2256
+ if description_node.nil?
2257
+ description_node = XPath.first(root_node, "content")
2258
+ end
2259
+ if description_node.nil?
2260
+ description_node = XPath.first(root_node, "info")
2261
+ @bozo = true unless description_node.nil?
2262
+ end
2263
+ unless description_node.nil?
2264
+ if XPath.first(description_node, "@encoding").to_s != ""
2265
+ @description =
2266
+ "[Embedded data objects are not currently supported.]"
2267
+ elsif XPath.first(description_node, "@type").to_s == "xhtml" ||
2268
+ XPath.first(description_node, "@mode").to_s == "xhtml" ||
2269
+ XPath.first(description_node, "@type").to_s == "xml" ||
2270
+ XPath.first(description_node, "@mode").to_s == "xml" ||
2271
+ XPath.first(description_node, "@type").to_s ==
2272
+ "application/xhtml+xml"
2273
+ @description = description_node.inner_xml
2274
+ elsif XPath.first(description_node, "@type").to_s == "escaped" ||
2275
+ XPath.first(description_node, "@mode").to_s == "escaped"
2276
+ @description = FeedTools.unescape_entities(
2277
+ description_node.inner_xml)
2278
+ else
2279
+ @description = description_node.inner_xml
2280
+ repair_entities = true
1893
2281
  end
1894
2282
  end
1895
2283
  if @description == ""
@@ -1900,20 +2288,13 @@ module FeedTools
1900
2288
  @description = self.itunes_subtitle
1901
2289
  @description = "" if @description.nil?
1902
2290
  end
1903
- if @description == ""
1904
- @description = self.media_text
1905
- @description = "" if @description.nil?
1906
- end
1907
2291
 
1908
2292
  unless @description.nil?
1909
- @description = FeedTools.sanitize_html(@description)
2293
+ @description = FeedTools.sanitize_html(@description, :strip)
2294
+ @description = FeedTools.unescape_entities(@description) if repair_entities
2295
+ @description = FeedTools.tidy_html(@description)
1910
2296
  end
1911
2297
 
1912
- # If it started with a bunch of divs, hack them right off. We can put
1913
- # them back later if they're needed.
1914
- @description.gsub!(/^(<div[^>]*>)*/, "")
1915
- @description.gsub!(/(<\/div>)*$/, "")
1916
-
1917
2298
  @description.gsub!(/\n/, " ") if @description.size < 80
1918
2299
  @description = @description.strip unless @description.nil?
1919
2300
  @description = nil if @description == ""
@@ -1925,6 +2306,66 @@ module FeedTools
1925
2306
  def description=(new_description)
1926
2307
  @description = new_description
1927
2308
  end
2309
+
2310
+ # Returns the contents of the itunes:summary element
2311
+ def itunes_summary
2312
+ if @itunes_summary.nil?
2313
+ @itunes_summary = FeedTools.unescape_entities(XPath.first(root_node,
2314
+ "itunes:summary/text()").to_s)
2315
+ if @itunes_summary == ""
2316
+ @itunes_summary = nil
2317
+ end
2318
+ unless @itunes_summary.nil?
2319
+ @itunes_summary = FeedTools.sanitize_html(@itunes_summary)
2320
+ end
2321
+ end
2322
+ return @itunes_summary
2323
+ end
2324
+
2325
+ # Sets the contents of the itunes:summary element
2326
+ def itunes_summary=(new_itunes_summary)
2327
+ @itunes_summary = new_itunes_summary
2328
+ end
2329
+
2330
+ # Returns the contents of the itunes:subtitle element
2331
+ def itunes_subtitle
2332
+ if @itunes_subtitle.nil?
2333
+ @itunes_subtitle = FeedTools.unescape_entities(XPath.first(root_node,
2334
+ "itunes:subtitle/text()").to_s)
2335
+ if @itunes_subtitle == ""
2336
+ @itunes_subtitle = nil
2337
+ end
2338
+ unless @itunes_subtitle.nil?
2339
+ @itunes_subtitle = FeedTools.sanitize_html(@itunes_subtitle)
2340
+ end
2341
+ end
2342
+ return @itunes_subtitle
2343
+ end
2344
+
2345
+ # Sets the contents of the itunes:subtitle element
2346
+ def itunes_subtitle=(new_itunes_subtitle)
2347
+ @itunes_subtitle = new_itunes_subtitle
2348
+ end
2349
+
2350
+ # Returns the contents of the media:text element
2351
+ def media_text
2352
+ if @media_text.nil?
2353
+ @media_text = FeedTools.unescape_entities(XPath.first(root_node,
2354
+ "itunes:subtitle/text()").to_s)
2355
+ if @media_text == ""
2356
+ @media_text = nil
2357
+ end
2358
+ unless @media_text.nil?
2359
+ @media_text = FeedTools.sanitize_html(@media_text)
2360
+ end
2361
+ end
2362
+ return @media_text
2363
+ end
2364
+
2365
+ # Sets the contents of the media:text element
2366
+ def media_text=(new_media_text)
2367
+ @media_text = new_media_text
2368
+ end
1928
2369
 
1929
2370
  # Returns the feed item link
1930
2371
  def link
@@ -1948,7 +2389,7 @@ module FeedTools
1948
2389
  end
1949
2390
  end
1950
2391
  if @link != ""
1951
- @link = CGI.unescapeHTML(@link)
2392
+ @link = FeedTools.unescape_entities(@link)
1952
2393
  end
1953
2394
  if @link != "" && (@link =~ /http:\/\//) != 0 && (@link =~ /https:\/\//) != 0
1954
2395
  if (feed.base[-1..-1] == "/" && @link[0..0] == "/")
@@ -2084,23 +2525,6 @@ module FeedTools
2084
2525
  @media_thumbnail_link = new_media_thumbnail_link
2085
2526
  end
2086
2527
 
2087
- # Returns the feed items's unique id
2088
- def id
2089
- if @id.nil?
2090
- @id = XPath.first(root_node, "id/text()").to_s
2091
- if @id == ""
2092
- @id = XPath.first(root_node, "guid/text()").to_s
2093
- end
2094
- @id = nil if @id == ""
2095
- end
2096
- return @id
2097
- end
2098
-
2099
- # Sets the feed item's unique id
2100
- def id=(new_id)
2101
- @id = new_id
2102
- end
2103
-
2104
2528
  # Returns all feed item enclosures
2105
2529
  def enclosures
2106
2530
  if @enclosures.nil?
@@ -2116,7 +2540,7 @@ module FeedTools
2116
2540
  # sometimes these also manage to show up in atom files.
2117
2541
  for enclosure_node in rss_enclosures
2118
2542
  enclosure = Enclosure.new
2119
- enclosure.url = CGI.unescapeHTML(enclosure_node.attributes["url"].to_s)
2543
+ enclosure.url = FeedTools.unescape_entities(enclosure_node.attributes["url"].to_s)
2120
2544
  enclosure.type = enclosure_node.attributes["type"].to_s
2121
2545
  enclosure.file_size = enclosure_node.attributes["length"].to_i
2122
2546
  enclosure.credits = []
@@ -2127,7 +2551,7 @@ module FeedTools
2127
2551
  # Parse atom-type enclosures. If there are repeats of the same enclosure object,
2128
2552
  # we merge the two together.
2129
2553
  for enclosure_node in atom_enclosures
2130
- enclosure_url = CGI.unescapeHTML(enclosure_node.attributes["href"].to_s)
2554
+ enclosure_url = FeedTools.unescape_entities(enclosure_node.attributes["href"].to_s)
2131
2555
  enclosure = nil
2132
2556
  new_enclosure = false
2133
2557
  for existing_enclosure in @enclosures
@@ -2156,7 +2580,7 @@ module FeedTools
2156
2580
  parse_media_content = lambda do |media_content_nodes|
2157
2581
  affected_enclosures = []
2158
2582
  for enclosure_node in media_content_nodes
2159
- enclosure_url = CGI.unescapeHTML(enclosure_node.attributes["url"].to_s)
2583
+ enclosure_url = FeedTools.unescape_entities(enclosure_node.attributes["url"].to_s)
2160
2584
  enclosure = nil
2161
2585
  new_enclosure = false
2162
2586
  for existing_enclosure in @enclosures
@@ -2182,9 +2606,9 @@ module FeedTools
2182
2606
  (enclosure_node.attributes["isDefault"].to_s.downcase == "true")
2183
2607
  if XPath.first(enclosure_node, "media:thumbnail/@url").to_s != ""
2184
2608
  enclosure.thumbnail = EnclosureThumbnail.new(
2185
- CGI.unescapeHTML(XPath.first(enclosure_node, "media:thumbnail/@url").to_s),
2186
- CGI.unescapeHTML(XPath.first(enclosure_node, "media:thumbnail/@height").to_s),
2187
- CGI.unescapeHTML(XPath.first(enclosure_node, "media:thumbnail/@width").to_s)
2609
+ FeedTools.unescape_entities(XPath.first(enclosure_node, "media:thumbnail/@url").to_s),
2610
+ FeedTools.unescape_entities(XPath.first(enclosure_node, "media:thumbnail/@height").to_s),
2611
+ FeedTools.unescape_entities(XPath.first(enclosure_node, "media:thumbnail/@width").to_s)
2188
2612
  )
2189
2613
  if enclosure.thumbnail.height == ""
2190
2614
  enclosure.thumbnail.height = nil
@@ -2196,9 +2620,9 @@ module FeedTools
2196
2620
  enclosure.categories = []
2197
2621
  for category in XPath.match(enclosure_node, "media:category")
2198
2622
  enclosure.categories << EnclosureCategory.new(
2199
- CGI.unescapeHTML(category.text),
2200
- CGI.unescapeHTML(category.attributes["scheme"].to_s),
2201
- CGI.unescapeHTML(category.attributes["label"].to_s)
2623
+ FeedTools.unescape_entities(category.text),
2624
+ FeedTools.unescape_entities(category.attributes["scheme"].to_s),
2625
+ FeedTools.unescape_entities(category.attributes["label"].to_s)
2202
2626
  )
2203
2627
  if enclosure.categories.last.scheme == ""
2204
2628
  enclosure.categories.last.scheme = nil
@@ -2209,16 +2633,16 @@ module FeedTools
2209
2633
  end
2210
2634
  if XPath.first(enclosure_node, "media:hash/text()").to_s != ""
2211
2635
  enclosure.hash = EnclosureHash.new(
2212
- FeedTools.sanitize_html(CGI.unescapeHTML(XPath.first(
2636
+ FeedTools.sanitize_html(FeedTools.unescape_entities(XPath.first(
2213
2637
  enclosure_node, "media:hash/text()").to_s), :strip),
2214
2638
  "md5"
2215
2639
  )
2216
2640
  end
2217
2641
  if XPath.first(enclosure_node, "media:player/@url").to_s != ""
2218
2642
  enclosure.player = EnclosurePlayer.new(
2219
- CGI.unescapeHTML(XPath.first(enclosure_node, "media:player/@url").to_s),
2220
- CGI.unescapeHTML(XPath.first(enclosure_node, "media:player/@height").to_s),
2221
- CGI.unescapeHTML(XPath.first(enclosure_node, "media:player/@width").to_s)
2643
+ FeedTools.unescape_entities(XPath.first(enclosure_node, "media:player/@url").to_s),
2644
+ FeedTools.unescape_entities(XPath.first(enclosure_node, "media:player/@height").to_s),
2645
+ FeedTools.unescape_entities(XPath.first(enclosure_node, "media:player/@width").to_s)
2222
2646
  )
2223
2647
  if enclosure.player.height == ""
2224
2648
  enclosure.player.height = nil
@@ -2230,8 +2654,8 @@ module FeedTools
2230
2654
  enclosure.credits = []
2231
2655
  for credit in XPath.match(enclosure_node, "media:credit")
2232
2656
  enclosure.credits << EnclosureCredit.new(
2233
- CGI.unescapeHTML(CGI.unescapeHTML(credit.text)),
2234
- CGI.unescapeHTML(credit.attributes["role"].to_s.downcase)
2657
+ FeedTools.unescape_entities(credit.text),
2658
+ FeedTools.unescape_entities(credit.attributes["role"].to_s.downcase)
2235
2659
  )
2236
2660
  if enclosure.credits.last.role == ""
2237
2661
  enclosure.credits.last.role = nil
@@ -2240,7 +2664,7 @@ module FeedTools
2240
2664
  enclosure.explicit = (XPath.first(enclosure_node,
2241
2665
  "media:adult/text()").to_s.downcase == "true")
2242
2666
  if XPath.first(enclosure_node, "media:text/text()").to_s != ""
2243
- enclosure.text = CGI.unescapeHTML(XPath.first(enclosure_node,
2667
+ enclosure.text = FeedTools.unescape_entities(XPath.first(enclosure_node,
2244
2668
  "media:text/text()").to_s)
2245
2669
  end
2246
2670
  affected_enclosures << enclosure
@@ -2271,11 +2695,11 @@ module FeedTools
2271
2695
  if enclosure.thumbnail.nil? &&
2272
2696
  XPath.first(media_group, "media:thumbnail/@url").to_s != ""
2273
2697
  enclosure.thumbnail = EnclosureThumbnail.new(
2274
- CGI.unescapeHTML(
2698
+ FeedTools.unescape_entities(
2275
2699
  XPath.first(media_group, "media:thumbnail/@url").to_s),
2276
- CGI.unescapeHTML(
2700
+ FeedTools.unescape_entities(
2277
2701
  XPath.first(media_group, "media:thumbnail/@height").to_s),
2278
- CGI.unescapeHTML(
2702
+ FeedTools.unescape_entities(
2279
2703
  XPath.first(media_group, "media:thumbnail/@width").to_s)
2280
2704
  )
2281
2705
  if enclosure.thumbnail.height == ""
@@ -2289,9 +2713,9 @@ module FeedTools
2289
2713
  enclosure.categories = []
2290
2714
  for category in XPath.match(media_group, "media:category")
2291
2715
  enclosure.categories << EnclosureCategory.new(
2292
- CGI.unescapeHTML(category.text),
2293
- CGI.unescapeHTML(category.attributes["scheme"].to_s),
2294
- CGI.unescapeHTML(category.attributes["label"].to_s)
2716
+ FeedTools.unescape_entities(category.text),
2717
+ FeedTools.unescape_entities(category.attributes["scheme"].to_s),
2718
+ FeedTools.unescape_entities(category.attributes["label"].to_s)
2295
2719
  )
2296
2720
  if enclosure.categories.last.scheme == ""
2297
2721
  enclosure.categories.last.scheme = nil
@@ -2304,16 +2728,16 @@ module FeedTools
2304
2728
  if enclosure.hash.nil? &&
2305
2729
  XPath.first(media_group, "media:hash/text()").to_s != ""
2306
2730
  enclosure.hash = EnclosureHash.new(
2307
- CGI.unescapeHTML(XPath.first(media_group, "media:hash/text()").to_s),
2731
+ FeedTools.unescape_entities(XPath.first(media_group, "media:hash/text()").to_s),
2308
2732
  "md5"
2309
2733
  )
2310
2734
  end
2311
2735
  if enclosure.player.nil? &&
2312
2736
  XPath.first(media_group, "media:player/@url").to_s != ""
2313
2737
  enclosure.player = EnclosurePlayer.new(
2314
- CGI.unescapeHTML(XPath.first(media_group, "media:player/@url").to_s),
2315
- CGI.unescapeHTML(XPath.first(media_group, "media:player/@height").to_s),
2316
- CGI.unescapeHTML(XPath.first(media_group, "media:player/@width").to_s)
2738
+ FeedTools.unescape_entities(XPath.first(media_group, "media:player/@url").to_s),
2739
+ FeedTools.unescape_entities(XPath.first(media_group, "media:player/@height").to_s),
2740
+ FeedTools.unescape_entities(XPath.first(media_group, "media:player/@width").to_s)
2317
2741
  )
2318
2742
  if enclosure.player.height == ""
2319
2743
  enclosure.player.height = nil
@@ -2326,8 +2750,8 @@ module FeedTools
2326
2750
  enclosure.credits = []
2327
2751
  for credit in XPath.match(media_group, "media:credit")
2328
2752
  enclosure.credits << EnclosureCredit.new(
2329
- CGI.unescapeHTML(CGI.unescapeHTML(credit.text)),
2330
- CGI.unescapeHTML(credit.attributes["role"].to_s.downcase)
2753
+ FeedTools.unescape_entities(credit.text),
2754
+ FeedTools.unescape_entities(credit.attributes["role"].to_s.downcase)
2331
2755
  )
2332
2756
  if enclosure.credits.last.role == ""
2333
2757
  enclosure.credits.last.role = nil
@@ -2340,7 +2764,7 @@ module FeedTools
2340
2764
  end
2341
2765
  if enclosure.text.nil? &&
2342
2766
  XPath.first(media_group, "media:text/text()").to_s != ""
2343
- enclosure.text = FeedTools.sanitize_html(CGI.unescapeHTML(
2767
+ enclosure.text = FeedTools.sanitize_html(FeedTools.unescape_entities(
2344
2768
  XPath.first(media_group, "media:text/text()").to_s), :strip)
2345
2769
  end
2346
2770
  end
@@ -2373,9 +2797,9 @@ module FeedTools
2373
2797
  enclosure.categories = []
2374
2798
  end
2375
2799
  enclosure.categories << EnclosureCategory.new(
2376
- CGI.unescapeHTML(category_path),
2377
- CGI.unescapeHTML("http://www.apple.com/itunes/store/"),
2378
- CGI.unescapeHTML("iTunes Music Store Categories")
2800
+ FeedTools.unescape_entities(category_path),
2801
+ FeedTools.unescape_entities("http://www.apple.com/itunes/store/"),
2802
+ FeedTools.unescape_entities("iTunes Music Store Categories")
2379
2803
  )
2380
2804
  end
2381
2805
  end
@@ -2464,136 +2888,140 @@ module FeedTools
2464
2888
  def enclosures=(new_enclosures)
2465
2889
  @enclosures = new_enclosures
2466
2890
  end
2467
-
2468
- # Returns the feed item author
2469
- def author_name
2470
- # TODO: make this not suck, actually ensure we're looking at a name
2471
- # and not an email address.
2472
- # Also, factor in itunes module.
2473
- # =================================================================
2474
- if @author_name.nil?
2475
- @author_name = CGI.unescapeHTML(XPath.first(root_node, "author/name/text()").to_s)
2476
- if @author_name == ""
2477
- @author_name = CGI.unescapeHTML(XPath.first(root_node, "dc:creator/text()").to_s)
2478
- end
2479
- if @author_name == ""
2480
- @author_name = CGI.unescapeHTML(XPath.first(root_node, "author/text()").to_s)
2481
- end
2482
- end
2483
- return @author_name
2484
- end
2485
2891
 
2486
- # Sets the feed item author
2487
- def author_name=(new_author_name)
2488
- @author_name = new_author_name
2489
- end
2490
-
2491
- # Returns the contents of the itunes:summary element
2492
- def itunes_summary
2493
- if @itunes_summary.nil?
2494
- @itunes_summary = CGI.unescapeHTML(XPath.first(root_node,
2495
- "itunes:summary/text()").to_s)
2496
- if @itunes_summary == ""
2497
- @itunes_summary = nil
2498
- end
2499
- unless @itunes_summary.nil?
2500
- @itunes_summary = FeedTools.sanitize_html(@itunes_summary)
2892
+ # Returns the feed item author
2893
+ def author
2894
+ if @author.nil?
2895
+ @author = FeedTools::Feed::Author.new
2896
+
2897
+ # Set the author name
2898
+ @author.name = FeedTools.unescape_entities(
2899
+ XPath.first(root_node, "author/name/text()").to_s)
2900
+
2901
+ @author.raw = FeedTools.unescape_entities(
2902
+ XPath.first(root_node, "author/text()").to_s)
2903
+ if @author.raw == ""
2904
+ @author.raw = FeedTools.unescape_entities(
2905
+ XPath.first(root_node, "dc:creator/text()").to_s)
2906
+ end
2907
+ if @author.raw == ""
2908
+ @author.raw = FeedTools.unescape_entities(
2909
+ XPath.first(root_node, "dc:author/text()").to_s)
2910
+ end
2911
+ unless @author.raw == ""
2912
+ raw_scan = @author.raw.scan(
2913
+ /(.*)\((\b[A-Z0-9._%-\+]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b)\)/i)
2914
+ if raw_scan.nil? || raw_scan.size == 0
2915
+ raw_scan = @author.raw.scan(
2916
+ /(\b[A-Z0-9._%-\+]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b)\s*\((.*)\)/i)
2917
+ author_raw_pair = raw_scan.first.reverse unless raw_scan.size == 0
2918
+ else
2919
+ author_raw_pair = raw_scan.first
2920
+ end
2921
+ if raw_scan.nil? || raw_scan.size == 0
2922
+ email_scan = @author.raw.scan(
2923
+ /\b[A-Z0-9._%-\+]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b/i)
2924
+ if email_scan != nil && email_scan.size > 0
2925
+ @author.email = email_scan.first.strip
2926
+ end
2927
+ end
2928
+ unless author_raw_pair.nil? || author_raw_pair.size == 0
2929
+ @author.name = author_raw_pair.first.strip
2930
+ @author.email = author_raw_pair.last.strip
2931
+ else
2932
+ unless @author.raw.include?("@")
2933
+ # We can be reasonably sure we are looking at something
2934
+ # that the creator didn't intend to contain an email address if
2935
+ # it got through the preceeding regexes and it doesn't
2936
+ # contain the tell-tale '@' symbol.
2937
+ @author.name = @author.raw
2938
+ end
2939
+ end
2501
2940
  end
2502
- end
2503
- return @itunes_summary
2504
- end
2505
2941
 
2506
- # Sets the contents of the itunes:summary element
2507
- def itunes_summary=(new_itunes_summary)
2508
- @itunes_summary = new_itunes_summary
2509
- end
2942
+ @author.name = nil if @author.name == ""
2943
+ @author.raw = nil if @author.raw == ""
2510
2944
 
2511
- # Returns the contents of the itunes:subtitle element
2512
- def itunes_subtitle
2513
- if @itunes_subtitle.nil?
2514
- @itunes_subtitle = CGI.unescapeHTML(XPath.first(root_node,
2515
- "itunes:subtitle/text()").to_s)
2516
- if @itunes_subtitle == ""
2517
- @itunes_subtitle = nil
2945
+ # Set the author email
2946
+ if @author.email == ""
2947
+ @author.email = FeedTools.unescape_entities(
2948
+ XPath.first(root_node, "author/email/text()").to_s)
2518
2949
  end
2519
- unless @itunes_subtitle.nil?
2520
- @itunes_subtitle = FeedTools.sanitize_html(@itunes_subtitle)
2521
- end
2522
- end
2523
- return @itunes_subtitle
2524
- end
2950
+ @author.email = nil if @author.email == ""
2525
2951
 
2526
- # Sets the contents of the itunes:subtitle element
2527
- def itunes_subtitle=(new_itunes_subtitle)
2528
- @itunes_subtitle = new_itunes_subtitle
2529
- end
2952
+ # Set the author url
2953
+ @author.url = FeedTools.unescape_entities(
2954
+ XPath.first(root_node, "author/url/text()").to_s)
2955
+ @author.url = nil if @author.url == ""
2530
2956
 
2531
- # Returns the contents of the media:text element
2532
- def media_text
2533
- if @media_text.nil?
2534
- @media_text = CGI.unescapeHTML(XPath.first(root_node,
2535
- "itunes:subtitle/text()").to_s)
2536
- if @media_text == ""
2537
- @media_text = nil
2538
- end
2539
- unless @media_text.nil?
2540
- @media_text = FeedTools.sanitize_html(@media_text)
2957
+ # Fallback on the itunes module if we didn't find an author name
2958
+ begin
2959
+ @author.name = self.itunes_author if @author.name.nil?
2960
+ rescue
2961
+ @author.name = nil
2541
2962
  end
2542
2963
  end
2543
- return @media_text
2964
+ return @author
2544
2965
  end
2545
-
2546
- # Sets the contents of the media:text element
2547
- def media_text=(new_media_text)
2548
- @media_text = new_media_text
2966
+
2967
+ # Sets the feed item author
2968
+ def author=(new_author)
2969
+ if new_author.respond_to?(:name) &&
2970
+ new_author.respond_to?(:email) &&
2971
+ new_author.respond_to?(:url)
2972
+ # It's a complete author object, just set it.
2973
+ @author = new_author
2974
+ else
2975
+ # We're not looking at an author object, this is probably a string,
2976
+ # default to setting the author's name.
2977
+ if @author.nil?
2978
+ @author = FeedTools::Feed::Author.new
2979
+ end
2980
+ @author.name = new_author
2981
+ end
2549
2982
  end
2550
2983
 
2551
2984
  # Returns the contents of the itunes:author element
2552
2985
  #
2553
2986
  # This inherits from any incorrectly placed channel-level itunes:author
2554
- # elements. They're actually amazingly commong. People don't read specs.
2987
+ # elements. They're actually amazingly common. People don't read specs.
2555
2988
  def itunes_author
2556
2989
  if @itunes_author.nil?
2557
- @itunes_author = CGI.unescapeHTML(XPath.first(root_node,
2990
+ @itunes_author = FeedTools.unescape_entities(XPath.first(root_node,
2558
2991
  "itunes:author/text()").to_s)
2559
- if @itunes_author == ""
2560
- @itunes_author = CGI.unescapeHTML(XPath.first(feed.channel_node,
2561
- "itunes:author/text()").to_s)
2562
- end
2563
- if @itunes_author == ""
2564
- @itunes_author = nil
2565
- end
2992
+ @itunes_author = feed.itunes_author if @itunes_author == ""
2993
+ @itunes_author = nil if @itunes_author == ""
2566
2994
  end
2567
2995
  return @itunes_author
2568
2996
  end
2569
-
2997
+
2570
2998
  # Sets the contents of the itunes:author element
2571
2999
  def itunes_author=(new_itunes_author)
2572
3000
  @itunes_author = new_itunes_author
2573
- end
2574
-
3001
+ end
3002
+
2575
3003
  # Returns the number of seconds that the associated media runs for
2576
- def duration
2577
- if @duration.nil?
2578
- itunes_duration = CGI.unescapeHTML(XPath.first(root_node,
3004
+ def itunes_duration
3005
+ if @itunes_duration.nil?
3006
+ raw_duration = FeedTools.unescape_entities(XPath.first(root_node,
2579
3007
  "itunes:duration/text()").to_s)
2580
- if itunes_duration != ""
2581
- hms = itunes_duration.split(":").map { |x| x.to_i }
3008
+ if raw_duration != ""
3009
+ hms = raw_duration.split(":").map { |x| x.to_i }
2582
3010
  if hms.size == 3
2583
- @duration = hms[0].hour + hms[1].minute + hms[2]
3011
+ @itunes_duration = hms[0].hour + hms[1].minute + hms[2]
2584
3012
  elsif hms.size == 2
2585
- @duration = hms[0].minute + hms[1]
3013
+ @itunes_duration = hms[0].minute + hms[1]
2586
3014
  elsif hms.size == 1
2587
- @duration = hms[0]
3015
+ @itunes_duration = hms[0]
2588
3016
  end
2589
3017
  end
2590
3018
  end
2591
- return @duration
3019
+ return @itunes_duration
2592
3020
  end
2593
3021
 
2594
3022
  # Sets the number of seconds that the associate media runs for
2595
- def duration=(new_duration)
2596
- @duration = new_duration
3023
+ def itunes_duration=(new_itunes_duration)
3024
+ @itunes_duration = new_itunes_duration
2597
3025
  end
2598
3026
 
2599
3027
  # Sets the itunes:summary
@@ -2722,7 +3150,8 @@ module FeedTools
2722
3150
  end
2723
3151
 
2724
3152
  # Generates xml based on the content of the feed item
2725
- def build_xml(feed_type="rss", version=0.0, xml_builder=Builder::XmlMarkup.new(:indent => 2))
3153
+ def build_xml(feed_type=(self.feed.feed_type or "rss"), version=0.0,
3154
+ xml_builder=Builder::XmlMarkup.new(:indent => 2))
2726
3155
  if feed_type == "rss" && (version == 0.9 || version == 1.0 || version == 1.1)
2727
3156
  # RDF-based rss format
2728
3157
  if link.nil?
@@ -2831,9 +3260,9 @@ module FeedTools
2831
3260
  end
2832
3261
  end
2833
3262
 
2834
- module REXML #:nodoc:
2835
- class Element #:nodoc:
2836
- def inner_xml #:nodoc:
3263
+ module REXML # :nodoc:
3264
+ class Element # :nodoc:
3265
+ def inner_xml # :nodoc:
2837
3266
  result = ""
2838
3267
  self.each_child do |child|
2839
3268
  result << child.to_s
@@ -2848,4 +3277,4 @@ begin
2848
3277
  FeedTools.feed_cache.initialize_cache
2849
3278
  end
2850
3279
  rescue
2851
- end
3280
+ end