feedtools 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,11 @@
1
+ == FeedTools 0.2.2
2
+ * fixed http redirection bug
3
+ * fixed several documentation typos
4
+ * still more unit tests
5
+ * improved support for atom
6
+ * minor improvements to the database caching mechanism
7
+ * more complete support for rss elements
8
+ * major improvements to the handling of tags containing html content
1
9
  == FeedTools 0.2.1
2
10
  * fixed incorrect dependency on ActiveRecord 1.10.1
3
11
  * more unit tests
data/lib/feed_tools.rb CHANGED
@@ -25,7 +25,7 @@ FEED_TOOLS_ENV = ENV['FEED_TOOLS_ENV'] ||
25
25
  ENV['RAILS_ENV'] ||
26
26
  'production' # :nodoc:
27
27
 
28
- FEED_TOOLS_VERSION = "0.2.1"
28
+ FEED_TOOLS_VERSION = "0.2.2"
29
29
 
30
30
  $:.unshift(File.dirname(__FILE__))
31
31
  $:.unshift(File.dirname(__FILE__) + "/../../activerecord/lib")
@@ -131,6 +131,19 @@ module FeedTools
131
131
  end
132
132
  return nil
133
133
  end
134
+
135
+ # Returns true if a connection to the database has been established and the
136
+ # required table structure is in place.
137
def DatabaseFeedCache.connected?
  # Touch the connection first; any error raised while probing is treated
  # as "not connected" by the rescue clause below.
  ActiveRecord::Base.connection
  configured = !ActiveRecord::Base.configurations.nil?
  (configured && DatabaseFeedCache.table_exists?) ? true : false
rescue
  false
end
134
147
 
135
148
  # True if the appropriate database table already exists
136
149
  def DatabaseFeedCache.table_exists?
@@ -258,12 +271,25 @@ module FeedTools
258
271
  # find_by_id
259
272
  # find_by_url
260
273
  # initialize_cache
274
+ # connected?
261
275
  def FeedTools.feed_cache=(new_feed_cache)
262
276
  # TODO: ensure that the feed cache class actually does those things.
263
277
  # ==================================================================
264
278
  @feed_cache = new_feed_cache
265
279
  end
266
280
 
281
+ # Returns true if FeedTools.feed_cache is not nil and a connection with
282
+ # the cache has been successfully established. Also returns false if an
283
+ # error is raised while trying to determine the status of the cache.
284
def FeedTools.feed_cache_connected?
  # No cache configured means not connected; otherwise defer to the cache's
  # own connected? check. Any error raised along the way also yields false.
  FeedTools.feed_cache.nil? ? false : FeedTools.feed_cache.connected?
rescue
  false
end
292
+
267
293
  # Returns the currently used user agent string.
268
294
  def FeedTools.user_agent
269
295
  return @user_agent
@@ -455,6 +481,25 @@ module FeedTools
455
481
  return true
456
482
  end
457
483
 
484
+ # Escapes all html entities
485
def FeedTools.escape_entities(html)
  # Runs the text through CGI.escapeHTML, then converts any quote characters
  # it left behind into their named entities.
  #
  # html - the text to escape.
  #
  # Returns a new, escaped string.
  escaped_html = CGI.escapeHTML(html)
  # The substitutions must be applied to escaped_html: it is the only local
  # this method assigns, and it is the value that gets returned.
  escaped_html = escaped_html.gsub(/'/, "&apos;")
  escaped_html = escaped_html.gsub(/"/, "&quot;")
  return escaped_html
end
491
+
492
+ # Unescapes all html entities
493
def FeedTools.unescape_entities(html)
  # Converts html entities in the given text back into plain characters.
  #
  # html - the text to unescape; it is left unmodified.
  #
  # Returns a new, unescaped string.

  # Normalize numeric ampersand references to the named form first so that
  # CGI.unescapeHTML treats them like any other "&amp;". Non-destructive
  # gsub is used throughout so the caller's string is never altered.
  unescaped_html = html.gsub(/&#x26;/, "&amp;")
  unescaped_html = unescaped_html.gsub(/&#38;/, "&amp;")
  unescaped_html = CGI.unescapeHTML(unescaped_html)
  # Pick up quote entities in case CGI.unescapeHTML left them alone.
  unescaped_html = unescaped_html.gsub(/&apos;/, "'")
  unescaped_html = unescaped_html.gsub(/&quot;/, "\"")
  return unescaped_html
end
502
+
458
503
  # Removes all html tags from the html formatted text.
459
504
  def FeedTools.strip_html(html)
460
505
  # TODO: do this properly
@@ -467,6 +512,7 @@ module FeedTools
467
512
  def FeedTools.tidy_html(html)
468
513
  if FeedTools.tidy_enabled?
469
514
  is_fragment = true
515
+ html.gsub!(/<!'/, "<!'")
470
516
  if (html.strip =~ /<html>(.|\n)*<body>/) != nil ||
471
517
  (html.strip =~ /<\/body>(.|\n)*<\/html>$/) != nil
472
518
  is_fragment = false
@@ -484,13 +530,15 @@ module FeedTools
484
530
  xml
485
531
  end
486
532
  if is_fragment
487
- # Tidy puts <html>...<body>[our html]</body>...</html> in.
533
+ # Tidy sticks <html>...<body>[our html]</body>...</html> in.
488
534
  # We don't want this.
489
535
  tidy_html.strip!
490
536
  tidy_html.gsub!(/^<html>(.|\n)*<body>/, "")
491
537
  tidy_html.gsub!(/<\/body>(.|\n)*<\/html>$/, "")
492
538
  tidy_html.strip!
493
539
  end
540
+ tidy_html.gsub!(/&#x26;/, "&amp;")
541
+ tidy_html.gsub!(/&#38;/, "&amp;")
494
542
  else
495
543
  tidy_html = html
496
544
  end
@@ -502,7 +550,7 @@ module FeedTools
502
550
  # be escaped. If mode is set to :strip, dangerous and unknown
503
551
  # elements and all children will be removed entirely.
504
552
  # Dangerous or unknown attributes are always removed.
505
- def FeedTools.sanitize_html(html, mode=:escape)
553
+ def FeedTools.sanitize_html(html, mode=:strip)
506
554
 
507
555
  # Lists borrowed from Mark Pilgrim's feedparser
508
556
  acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
@@ -527,18 +575,14 @@ module FeedTools
527
575
  'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
528
576
  'type', 'usemap', 'valign', 'value', 'vspace', 'width']
529
577
 
530
- # Stupid hack to pass this unit test:
531
- # http://feedparser.org/tests/wellformed/rss/
532
- # item_description_not_a_doctype.xml
533
- html.gsub!(/<!'/, "&lt;!'")
534
-
535
- # The closer we are to proper xhtml, the more accurate the
536
- # sanitization will be.
537
- html = FeedTools.tidy_html(html)
538
-
578
+ # Replace with appropriate named entities
579
+ html.gsub!(/&#x26;/, "&amp;")
580
+ html.gsub!(/&#38;/, "&amp;")
581
+ html.gsub!(/&lt;!'/, "&amp;lt;!'")
582
+
539
583
  # Hackity hack. But it works, and it seems plenty fast enough.
540
584
  html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
541
-
585
+
542
586
  sanitize_node = lambda do |html_node|
543
587
  if html_node.respond_to? :children
544
588
  for child in html_node.children
@@ -564,13 +608,83 @@ module FeedTools
564
608
  html_node
565
609
  end
566
610
  sanitize_node.call(html_doc.root)
567
- return html_doc.root.inner_xml
611
+ html = html_doc.root.inner_xml
612
+ return html
568
613
  end
569
614
 
570
615
  class Feed
571
616
  include REXML
572
617
  include AttributeDictionary
573
618
 
619
+ # Represents a feed/feed item's category
620
class Category
  # The category term value
  attr_accessor :term
  # The categorization scheme
  attr_accessor :scheme
  # A human-readable description of the category
  attr_accessor :label

  # Relays any unknown methods to the term so that you can treat the
  # category object as a string.
  #
  # The arguments are splatted and the block is passed along, so the term
  # receives exactly what the caller supplied.
  #
  # NOTE(review): respond_to? is left untouched, so it still reports only
  # the methods defined on this class.
  def method_missing(msg, *params, &block)
    self.term.send(msg, *params, &block)
  end

  # Relays the to_s method to the term field
  def to_s
    self.term.to_s
  end

  # Relays the inspect method to the term field
  def inspect
    self.term.inspect
  end
end
644
+
645
+ # Represents a feed/feed item's author
646
class Author
  # The author's real name
  attr_accessor :name
  # The author's email address
  attr_accessor :email
  # The url of the author's homepage
  attr_accessor :url
  # The raw value of the author tag if present
  attr_accessor :raw

  # Relays any unknown methods to the name so that you can treat the
  # author object as a string.
  #
  # The arguments are splatted and the block is passed along, so the name
  # receives exactly what the caller supplied.
  #
  # NOTE(review): respond_to? is left untouched, so it still reports only
  # the methods defined on this class.
  def method_missing(msg, *params, &block)
    self.name.send(msg, *params, &block)
  end

  # Relays the to_s method to the name field
  def to_s
    self.name.to_s
  end

  # Relays the inspect method to the name field
  def inspect
    self.name.inspect
  end
end
672
+
673
+ # Represents a feed's cloud.
674
class Cloud
  # Plain value holder for the settings of a feed's cloud:
  #
  #   domain             - the domain of the cloud
  #   path               - the path for the cloud
  #   port               - the port the cloud is listening on
  #   protocol           - the web services protocol the cloud uses,
  #                        either "xml-rpc" or "soap"
  #   register_procedure - the procedure to use to request notification
  attr_accessor :domain, :path, :port, :protocol, :register_procedure
end
687
+
574
688
  # Loads the feed specified by the url, pulling the data from the cache if it hasn't expired.
575
689
  def Feed.open(url)
576
690
  # clean up the url
@@ -671,14 +785,14 @@ module FeedTools
671
785
 
672
786
  Net::HTTP.start(feed_uri.host, (feed_uri.port or 80)) do |http|
673
787
  response = http.request_get(feed_uri.path, http_headers)
674
-
788
+
675
789
  case response
676
790
  when Net::HTTPSuccess
677
791
  # We've reached the final destination, process all previous
678
792
  # redirections, and see if we need to update the url.
679
793
  for redirected_response in response_chain
680
794
  if redirected_response.last.code.to_i == 301
681
- self.url = redirected_response.first
795
+ self.url = redirected_response.last['location']
682
796
  else
683
797
  # Jump out as soon as we hit anything that isn't a
684
798
  # permanently moved redirection.
@@ -690,7 +804,7 @@ module FeedTools
690
804
  if response.code.to_i == 304
691
805
  response.error!
692
806
  else
693
- if response['Location'].nil?
807
+ if response['location'].nil?
694
808
  raise FeedAccessError,
695
809
  "No location to redirect to supplied: " + response.code
696
810
  end
@@ -913,6 +1027,30 @@ module FeedTools
913
1027
  @cache_object = new_cache_object
914
1028
  end
915
1029
 
1030
+ # Returns the type of feed
1031
+ # Possible values:
1032
+ # "rss", "atom", "cdf", "!okay/news"
1033
def feed_type
  # Derives the feed type from the name of the root element and caches it.
  #
  # Returns "atom", "rss" or "cdf", a value assigned through feed_type=,
  # or nil when there is no root node or its name is not recognized.
  if @feed_type.nil?
    root = root_node
    # Without a root node there is nothing to inspect; leaving the type nil
    # lets callers fall back to their own default.
    unless root.nil?
      @feed_type =
        case root.name.to_s.downcase
        when "feed"
          "atom"
        # Assuming root_node is a REXML element: REXML reports the local
        # name ("RDF") for a prefixed root such as <rdf:RDF>, so both
        # spellings are accepted.
        when "rdf:rdf", "rdf", "rss"
          "rss"
        when "channel"
          "cdf"
        end
    end
  end
  return @feed_type
end
1048
+
1049
+ # Sets the default feed type
1050
def feed_type=(new_feed_type)
  # Stores the value as given; feed_type returns it instead of inspecting
  # the root node whenever it is not nil.
  @feed_type = new_feed_type
end
1053
+
916
1054
  # Returns the feed's unique id
917
1055
  def id
918
1056
  if @id.nil?
@@ -948,24 +1086,30 @@ module FeedTools
948
1086
  # Returns the feed title
949
1087
  def title
950
1088
  if @title.nil?
951
- if XPath.first(channel_node, "title/@type").to_s == "xhtml" ||
952
- XPath.first(channel_node, "title/@mode").to_s == "xhtml"
1089
+ repair_entities = false
1090
+ if XPath.first(channel_node, "title/@type").to_s == "xhtml" ||
1091
+ XPath.first(channel_node, "title/@mode").to_s == "xhtml" ||
1092
+ XPath.first(channel_node, "title/@type").to_s == "xml" ||
1093
+ XPath.first(channel_node, "title/@mode").to_s == "xml" ||
1094
+ XPath.first(channel_node, "title/@type").to_s ==
1095
+ "application/xhtml+xml"
953
1096
  @title = XPath.first(channel_node, "title").inner_xml
954
1097
  elsif XPath.first(channel_node, "title/@type").to_s == "escaped" ||
955
1098
  XPath.first(channel_node, "title/@mode").to_s == "escaped"
956
- @title = CGI.unescapeHTML(
1099
+ @title = FeedTools.unescape_entities(
957
1100
  XPath.first(channel_node, "title/text()").to_s)
958
1101
  else
959
- @title = CGI.unescapeHTML(
960
- XPath.first(channel_node, "title/text()").to_s)
1102
+ title_node = XPath.first(channel_node, "title")
1103
+ @title = title_node.inner_xml
1104
+ repair_entities = true
961
1105
  end
962
1106
  unless @title.nil?
963
- @title = CGI.unescapeHTML(FeedTools.sanitize_html(@title, :strip))
964
- end
965
- if @title != "" && !(@title.nil?)
966
- @title = FeedTools.strip_html(@title).strip
1107
+ @title = FeedTools.sanitize_html(@title, :strip)
1108
+ @title = FeedTools.unescape_entities(@title) if repair_entities
1109
+ @title = FeedTools.tidy_html(@title)
967
1110
  end
968
1111
  @title.gsub!(/\n/, " ")
1112
+ @title.strip!
969
1113
  @title = nil if @title == ""
970
1114
  self.cache_object.title = @title unless self.cache_object.nil?
971
1115
  end
@@ -981,57 +1125,57 @@ module FeedTools
981
1125
  # Returns the feed description
982
1126
  def description
983
1127
  if @description.nil?
984
- # get the feed description from the xml document
985
- @description = XPath.first(channel_node, "description/text()").to_s
986
- if @description != ""
987
- if XPath.first(channel_node, "description/@encoding").to_s != ""
988
- @description = "[Embedded data objects are not supported.]"
1128
+ repair_entities = false
1129
+ description_node = XPath.first(channel_node, "description")
1130
+ if description_node.nil?
1131
+ description_node = XPath.first(channel_node, "tagline")
1132
+ end
1133
+ if description_node.nil?
1134
+ description_node = XPath.first(channel_node, "subtitle")
1135
+ end
1136
+ if description_node.nil?
1137
+ description_node = XPath.first(channel_node, "summary")
1138
+ end
1139
+ if description_node.nil?
1140
+ description_node = XPath.first(channel_node, "abstract")
1141
+ end
1142
+ if description_node.nil?
1143
+ description_node = XPath.first(channel_node, "info")
1144
+ end
1145
+ if description_node.nil?
1146
+ description_node = XPath.first(channel_node, "content:encoded")
1147
+ @bozo = true unless description_node.nil?
1148
+ end
1149
+ if description_node.nil?
1150
+ description_node = XPath.first(channel_node, "content")
1151
+ @bozo = true unless description_node.nil?
1152
+ end
1153
+ if description_node.nil?
1154
+ description_node = XPath.first(channel_node, "xhtml:body")
1155
+ @bozo = true unless description_node.nil?
1156
+ end
1157
+ if description_node.nil?
1158
+ description_node = XPath.first(channel_node, "body")
1159
+ @bozo = true unless description_node.nil?
1160
+ end
1161
+ unless description_node.nil?
1162
+ if XPath.first(description_node, "@encoding").to_s != ""
1163
+ @description =
1164
+ "[Embedded data objects are not currently supported.]"
1165
+ elsif XPath.first(description_node, "@type").to_s == "xhtml" ||
1166
+ XPath.first(description_node, "@mode").to_s == "xhtml" ||
1167
+ XPath.first(description_node, "@type").to_s == "xml" ||
1168
+ XPath.first(description_node, "@mode").to_s == "xml" ||
1169
+ XPath.first(description_node, "@type").to_s ==
1170
+ "application/xhtml+xml"
1171
+ @description = description_node.inner_xml
1172
+ elsif XPath.first(description_node, "@type").to_s == "escaped" ||
1173
+ XPath.first(description_node, "@mode").to_s == "escaped"
1174
+ @description = FeedTools.unescape_entities(
1175
+ description_node.inner_xml)
989
1176
  else
990
- @description = CGI.unescapeHTML(description)
991
- end
992
- end
993
- if @description == ""
994
- @description = XPath.first(channel_node, "subtitle/text()").to_s
995
- if @description != "" &&
996
- XPath.first(channel_node, "subtitle/@mode").to_s == "escaped"
997
- @description = CGI.unescapeHTML(description)
998
- end
999
- end
1000
- if @description == ""
1001
- @description = XPath.first(channel_node, "tagline/text()").to_s
1002
- if @description != "" &&
1003
- XPath.first(channel_node, "tagline/@mode").to_s == "escaped"
1004
- @description = CGI.unescapeHTML(description)
1005
- end
1006
- end
1007
- if @description == "" && XPath.first(channel_node, "tagline") == nil
1008
- @description = XPath.first(channel_node, "info/text()").to_s
1009
- if @description != "" &&
1010
- XPath.first(channel_node, "info/@mode").to_s == "escaped"
1011
- @description = CGI.unescapeHTML(description)
1012
- end
1013
- end
1014
- if @description == ""
1015
- @description = CGI.unescapeHTML(
1016
- XPath.first(channel_node, "abstract/text()").to_s)
1017
- end
1018
- if @description == ""
1019
- @description = CGI.unescapeHTML(
1020
- XPath.first(channel_node, "summary/text()").to_s)
1021
- end
1022
- if @description == ""
1023
- # I don't think this is valid for anyone to do, but this is probably
1024
- # what they meant if they do it.
1025
- @description = CGI.unescapeHTML(
1026
- XPath.first(channel_node, "content:encoded/text()").to_s)
1027
- if @description != ""
1028
- @bozo = true
1029
- end
1030
- end
1031
- if @description == ""
1032
- begin
1033
- @description = XPath.first(channel_node, "description").inner_xml
1034
- rescue
1177
+ @description = description_node.inner_xml
1178
+ repair_entities = true
1035
1179
  end
1036
1180
  end
1037
1181
  if @description == ""
@@ -1043,13 +1187,12 @@ module FeedTools
1043
1187
  @description = "" if @description.nil?
1044
1188
  end
1045
1189
 
1046
- @description =
1047
- FeedTools.sanitize_html(@description) unless @description.nil?
1048
- # If it started with a bunch of divs, hack them right off. We can put
1049
- # them back later if they're needed.
1050
- @description.gsub!(/^(<div[^>]*>)*/, "")
1051
- @description.gsub!(/(<\/div>)*$/, "")
1052
-
1190
+ unless @description.nil?
1191
+ @description = FeedTools.sanitize_html(@description, :strip)
1192
+ @description = FeedTools.unescape_entities(@description) if repair_entities
1193
+ @description = FeedTools.tidy_html(@description)
1194
+ end
1195
+
1053
1196
  @description.gsub!(/\n/, " ") if @description.size < 80
1054
1197
  @description = @description.strip unless @description.nil?
1055
1198
  @description = nil if @description == ""
@@ -1065,7 +1208,7 @@ module FeedTools
1065
1208
  # Returns the contents of the itunes:summary element
1066
1209
  def itunes_summary
1067
1210
  if @itunes_summary.nil?
1068
- @itunes_summary = CGI.unescapeHTML(XPath.first(root_node,
1211
+ @itunes_summary = FeedTools.unescape_entities(XPath.first(root_node,
1069
1212
  "itunes:summary/text()").to_s)
1070
1213
  if @itunes_summary == ""
1071
1214
  @itunes_summary = nil
@@ -1084,7 +1227,7 @@ module FeedTools
1084
1227
  # Returns the contents of the itunes:subtitle element
1085
1228
  def itunes_subtitle
1086
1229
  if @itunes_subtitle.nil?
1087
- @itunes_subtitle = CGI.unescapeHTML(XPath.first(root_node,
1230
+ @itunes_subtitle = FeedTools.unescape_entities(XPath.first(root_node,
1088
1231
  "itunes:subtitle/text()").to_s)
1089
1232
  if @itunes_subtitle == ""
1090
1233
  @itunes_subtitle = nil
@@ -1211,6 +1354,197 @@ module FeedTools
1211
1354
  return @icon_link
1212
1355
  end
1213
1356
 
1357
+ # Returns the feed author
1358
def author
  # Builds (once) and returns the feed's author as a
  # FeedTools::Feed::Author, filled in from the channel node.
  #
  # Name, email and url are read from author/name, author/email and
  # author/url. The free-form text of author, dc:creator or dc:author is
  # kept in raw and, when it looks like "Name (email)" or "email (Name)",
  # is split into name and email. If no name is found, itunes_author is
  # used as a last resort.
  if @author.nil?
    @author = FeedTools::Feed::Author.new

    # Set the author name
    @author.name = FeedTools.unescape_entities(
      XPath.first(channel_node, "author/name/text()").to_s)

    # Raw text comes from the first of these elements that yields any.
    @author.raw = FeedTools.unescape_entities(
      XPath.first(channel_node, "author/text()").to_s)
    if @author.raw == ""
      @author.raw = FeedTools.unescape_entities(
        XPath.first(channel_node, "dc:creator/text()").to_s)
    end
    if @author.raw == ""
      @author.raw = FeedTools.unescape_entities(
        XPath.first(channel_node, "dc:author/text()").to_s)
    end
    unless @author.raw == ""
      author_raw_pair = nil
      # "Name (email@example.com)". The "-" sits last in the character
      # class so it is a literal rather than the start of a range.
      raw_scan = @author.raw.scan(
        /(.*)\((\b[A-Z0-9._%+-]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b)\)/i)
      if raw_scan.size == 0
        # "email@example.com (Name)"; reversed so the name comes first.
        raw_scan = @author.raw.scan(
          /(\b[A-Z0-9._%+-]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b)\s*\((.*)\)/i)
        author_raw_pair = raw_scan.first.reverse unless raw_scan.size == 0
      else
        author_raw_pair = raw_scan.first
      end
      if raw_scan.size == 0
        # Neither layout matched; settle for a bare address if there is one.
        email_scan = @author.raw.scan(
          /\b[A-Z0-9._%+-]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b/i)
        if email_scan.size > 0
          @author.email = email_scan.first.strip
        end
      end
      if author_raw_pair.nil? || author_raw_pair.size == 0
        # We can be reasonably sure we are looking at something
        # that the creator didn't intend to contain an email address if
        # it got through the preceding regexes and it doesn't
        # contain the tell-tale '@' symbol. Whitespace-only text (such as
        # the indentation inside a structured author element) is skipped
        # so it cannot replace a name that was already found.
        unless @author.raw.include?("@") || @author.raw.strip == ""
          @author.name = @author.raw
        end
      else
        @author.name = author_raw_pair.first.strip
        @author.email = author_raw_pair.last.strip
      end
    end

    @author.name = nil if @author.name == ""
    @author.raw = nil if @author.raw == ""

    # Set the author email. The email is still nil, not "", when the raw
    # text produced none, so both cases fall back to the email element.
    if @author.email.nil? || @author.email == ""
      @author.email = FeedTools.unescape_entities(
        XPath.first(channel_node, "author/email/text()").to_s)
    end
    @author.email = nil if @author.email == ""

    # Set the author url
    @author.url = FeedTools.unescape_entities(
      XPath.first(channel_node, "author/url/text()").to_s)
    @author.url = nil if @author.url == ""

    # Fallback on the itunes module if we didn't find an author name
    begin
      @author.name = self.itunes_author if @author.name.nil?
    rescue
      @author.name = nil
    end
  end
  return @author
end
1431
+
1432
+ # Sets the feed author
1433
def author=(new_author)
  # Anything exposing name, email and url is taken to be a complete author
  # object and stored as-is.
  author_like = [:name, :email, :url].all? do |reader|
    new_author.respond_to?(reader)
  end
  if author_like
    @author = new_author
  else
    # Otherwise this is probably a string: use it as the name of the
    # current author, creating an empty author first if there is none.
    @author = FeedTools::Feed::Author.new if @author.nil?
    @author.name = new_author
  end
end
1448
+
1449
+ # Returns the feed publisher
1450
def publisher
  # Builds (once) and returns the feed's publisher as a
  # FeedTools::Feed::Author, filled in from the text of dc:publisher.
  #
  # The text is kept in raw and, when it looks like "Name (email)" or
  # "email (Name)", is split into name and email. Text without an '@' is
  # used as the name.
  if @publisher.nil?
    @publisher = FeedTools::Feed::Author.new

    # Set the publisher's raw text
    @publisher.raw = FeedTools.unescape_entities(
      XPath.first(channel_node, "dc:publisher/text()").to_s)
    unless @publisher.raw == ""
      publisher_raw_pair = nil
      # "Name (email@example.com)". The "-" sits last in the character
      # class so it is a literal rather than the start of a range.
      raw_scan = @publisher.raw.scan(
        /(.*)\((\b[A-Z0-9._%+-]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b)\)/i)
      if raw_scan.size == 0
        # "email@example.com (Name)"; reversed so the name comes first.
        raw_scan = @publisher.raw.scan(
          /(\b[A-Z0-9._%+-]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b)\s*\((.*)\)/i)
        unless raw_scan.size == 0
          publisher_raw_pair = raw_scan.first.reverse
        end
      else
        publisher_raw_pair = raw_scan.first
      end
      if raw_scan.size == 0
        # Neither layout matched; settle for a bare address if there is one.
        email_scan = @publisher.raw.scan(
          /\b[A-Z0-9._%+-]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b/i)
        if email_scan.size > 0
          @publisher.email = email_scan.first.strip
        end
      end
      if publisher_raw_pair.nil? || publisher_raw_pair.size == 0
        unless @publisher.raw.include?("@")
          # We can be reasonably sure we are looking at something
          # that the creator didn't intend to contain an email address if
          # it got through the preceding regexes and it doesn't
          # contain the tell-tale '@' symbol.
          @publisher.name = @publisher.raw
        end
      else
        @publisher.name = publisher_raw_pair.first.strip
        @publisher.email = publisher_raw_pair.last.strip
      end
    end

    @publisher.name = nil if @publisher.name == ""
    @publisher.raw = nil if @publisher.raw == ""
    @publisher.email = nil if @publisher.email == ""
    @publisher.url = nil if @publisher.url == ""
  end
  return @publisher
end
1497
+
1498
+ # Sets the feed publisher
1499
def publisher=(new_publisher)
  # Anything exposing name, email and url is taken to be a complete Author
  # object and stored as-is.
  author_like = [:name, :email, :url].all? do |reader|
    new_publisher.respond_to?(reader)
  end
  if author_like
    @publisher = new_publisher
  else
    # Otherwise this is probably a string: use it as the name of the
    # current publisher, creating an empty one first if there is none.
    @publisher = FeedTools::Feed::Author.new if @publisher.nil?
    @publisher.name = new_publisher
  end
end
1514
+
1515
+ # Returns the contents of the itunes:author element
1516
+ #
1517
+ # Returns any incorrectly placed channel-level itunes:author
1518
+ # elements. They're actually amazingly common. People don't read specs.
1519
+ # There is no setter for this, since this is a "bozo" attribute.
1520
def itunes_author
  # Reads the channel-level itunes:author text on first use and caches it.
  # Finding any text there also sets the bozo flag.
  return @itunes_author unless @itunes_author.nil?

  author_text = XPath.first(channel_node, "itunes:author/text()").to_s
  @itunes_author = FeedTools.unescape_entities(author_text)
  @itunes_author = nil if @itunes_author == ""
  @bozo = true unless @itunes_author.nil?
  return @itunes_author
end
1529
+
1530
+ # Returns the feed's copyright information
1531
def copyright
  # Reads the copyright text on first use, falling back to dc:rights when
  # the copyright element yields nothing, and sanitizes it in :strip mode.
  # An empty result is reported as nil.
  return @copyright unless @copyright.nil?

  @copyright = XPath.first(channel_node, "copyright/text()").to_s
  @copyright = XPath.first(channel_node, "dc:rights/text()").to_s if @copyright == ""
  @copyright = FeedTools.sanitize_html(@copyright, :strip)
  @copyright = nil if @copyright == ""
  return @copyright
end
1542
+
1543
+ # Sets the feed's copyright information
1544
def copyright=(new_copyright)
  # Stored as given; no sanitizing is applied on assignment.
  @copyright = new_copyright
end
1547
+
1214
1548
  # Returns the number of seconds before the feed should expire
1215
1549
  def time_to_live
1216
1550
  if @time_to_live.nil?
@@ -1275,6 +1609,33 @@ module FeedTools
1275
1609
  @time_to_live = 1 if @time_to_live < 1
1276
1610
  end
1277
1611
 
1612
+ # Returns the feed's cloud
1613
def cloud
  # Builds the Cloud object from the attributes of the channel's cloud
  # element on first use and caches it. Attributes that are missing or
  # empty end up as nil; the port is converted to an integer and a port
  # of 0 is also reported as nil.
  return @cloud unless @cloud.nil?

  @cloud = FeedTools::Feed::Cloud.new
  @cloud.domain = XPath.first(channel_node, "cloud/@domain").to_s
  @cloud.port = XPath.first(channel_node, "cloud/@port").to_s
  @cloud.path = XPath.first(channel_node, "cloud/@path").to_s
  procedure_attribute = XPath.first(channel_node, "cloud/@registerProcedure")
  @cloud.register_procedure = procedure_attribute.to_s
  protocol_attribute = XPath.first(channel_node, "cloud/@protocol")
  @cloud.protocol = protocol_attribute.to_s.downcase

  @cloud.domain = nil if @cloud.domain == ""
  if @cloud.port == ""
    @cloud.port = nil
  else
    @cloud.port = @cloud.port.to_i
    @cloud.port = nil if @cloud.port == 0
  end
  @cloud.path = nil if @cloud.path == ""
  @cloud.register_procedure = nil if @cloud.register_procedure == ""
  @cloud.protocol = nil if @cloud.protocol == ""
  return @cloud
end
1633
+
1634
+ # Sets the feed's cloud
1635
def cloud=(new_cloud)
  # Stored as given; cloud returns it instead of reading the channel node
  # whenever it is not nil.
  @cloud = new_cloud
end
1638
+
1278
1639
  # Returns the feed generator
1279
1640
  def generator
1280
1641
  if @generator.nil?
@@ -1454,7 +1815,8 @@ module FeedTools
1454
1815
  end
1455
1816
 
1456
1817
  # Generates xml based on the content of the feed
1457
- def build_xml(feed_type="rss", version=0.0, xml_builder=Builder::XmlMarkup.new(:indent => 2))
1818
+ def build_xml(feed_type=(self.feed_type or "rss"), version=0.0,
1819
+ xml_builder=Builder::XmlMarkup.new(:indent => 2))
1458
1820
  if feed_type == "rss" && version == 0.0
1459
1821
  version = 1.0
1460
1822
  elsif feed_type == "atom" && version == 0.0
@@ -1801,22 +2163,47 @@ module FeedTools
1801
2163
  return @root_node
1802
2164
  end
1803
2165
 
2166
+ # Returns the feed item's unique id
2167
def id
  # Uses the text of the id element, or of guid when id yields nothing.
  # An empty result is reported as nil.
  return @id unless @id.nil?

  @id = XPath.first(root_node, "id/text()").to_s
  @id = XPath.first(root_node, "guid/text()").to_s if @id == ""
  @id = nil if @id == ""
  return @id
end
2177
+
2178
+ # Sets the feed item's unique id
2179
def id=(new_id)
  # Stored as given; id returns it instead of reading the root node
  # whenever it is not nil.
  @id = new_id
end
2182
+
1804
2183
  # Returns the feed item title
1805
2184
  def title
1806
2185
  if @title.nil?
2186
+ repair_entities = false
1807
2187
  if XPath.first(root_node, "title/@type").to_s == "xhtml" ||
1808
- XPath.first(root_node, "title/@mode").to_s == "xhtml"
2188
+ XPath.first(root_node, "title/@mode").to_s == "xhtml" ||
2189
+ XPath.first(root_node, "title/@type").to_s == "xml" ||
2190
+ XPath.first(root_node, "title/@mode").to_s == "xml" ||
2191
+ XPath.first(root_node, "title/@type").to_s ==
2192
+ "application/xhtml+xml"
1809
2193
  @title = XPath.first(root_node, "title").inner_xml
1810
2194
  elsif XPath.first(root_node, "title/@type").to_s == "escaped" ||
1811
2195
  XPath.first(root_node, "title/@mode").to_s == "escaped"
1812
- @title = CGI.unescapeHTML(
2196
+ @title = FeedTools.unescape_entities(
1813
2197
  XPath.first(root_node, "title/text()").to_s)
1814
2198
  else
1815
- @title = CGI.unescapeHTML(
1816
- XPath.first(root_node, "title/text()").to_s)
2199
+ title_node = XPath.first(root_node, "title")
2200
+ @title = title_node.inner_xml
2201
+ repair_entities = true
1817
2202
  end
1818
2203
  unless @title.nil?
1819
- @title = CGI.unescapeHTML(FeedTools.sanitize_html(@title, :strip))
2204
+ @title = FeedTools.sanitize_html(@title, :strip)
2205
+ @title = FeedTools.unescape_entities(@title) if repair_entities
2206
+ @title = FeedTools.tidy_html(@title)
1820
2207
  end
1821
2208
  if @title != ""
1822
2209
  # Some blogging tools include the number of comments in a post
@@ -1826,10 +2213,10 @@ module FeedTools
1826
2213
  #
1827
2214
  # If for some incredibly weird reason you need the actual
1828
2215
  # unstripped title, just use find_node("title/text()").to_s
1829
- @title = FeedTools.strip_html(
1830
- @title.strip.gsub(/\[\d*\]$/, "")).strip
1831
- @title.gsub!(/\n/, " ")
2216
+ @title = @title.strip.gsub(/\[\d*\]$/, "").strip
1832
2217
  end
2218
+ @title.gsub!(/\n/, " ")
2219
+ @title.strip!
1833
2220
  @title = nil if @title == ""
1834
2221
  end
1835
2222
  return @title
@@ -1843,53 +2230,54 @@ module FeedTools
1843
2230
  # Returns the feed item description
1844
2231
  def description
1845
2232
  if @description.nil?
1846
- # get the item content
1847
- @description = ""
1848
- body_node = XPath.first(root_node, "xhtml:body")
1849
- if body_node == nil
1850
- body_node = XPath.first(root_node, "body")
1851
- end
1852
- if body_node != nil
1853
- @description = body_node.inner_xml
1854
- end
1855
- if @description == ""
1856
- @description =
1857
- CGI.unescapeHTML(XPath.first(root_node, "content:encoded/text()").to_s)
1858
- end
1859
- if @description == ""
1860
- begin
1861
- @description = XPath.first(root_node, "description").cdatas.first.to_s
1862
- rescue
1863
- @description = ""
1864
- end
1865
- if @description == ""
1866
- @description = XPath.first(root_node, "description/text()").to_s
1867
- end
1868
- if @description != ""
1869
- if XPath.first(root_node, "description/@encoding").to_s != ""
1870
- # Not supported... yet.
1871
- @description = "[Embedded data objects are not supported.]"
1872
- else
1873
- @description = CGI.unescapeHTML(@description)
1874
- end
1875
- end
1876
- end
1877
- if @description == ""
1878
- @description = XPath.first(root_node, "content/text()").to_s
1879
- if @description != "" &&
1880
- (XPath.first(root_node, "content/@mode").to_s == "escaped" ||
1881
- XPath.first(root_node, "content/@type").to_s == "escaped")
1882
- @description = CGI.unescapeHTML(@description)
1883
- end
1884
- if XPath.first(root_node, "content/@mode").to_s == "xhtml" ||
1885
- XPath.first(root_node, "content/@type").to_s == "xhtml"
1886
- @description = XPath.first(root_node, "content").inner_xml
1887
- end
1888
- end
1889
- if @description == ""
1890
- begin
1891
- @description = XPath.first(root_node, "description").inner_xml
1892
- rescue
2233
+ repair_entities = false
2234
+ description_node = XPath.first(root_node, "description")
2235
+ if description_node.nil?
2236
+ description_node = XPath.first(root_node, "xhtml:body")
2237
+ end
2238
+ if description_node.nil?
2239
+ description_node = XPath.first(root_node, "body")
2240
+ end
2241
+ if description_node.nil?
2242
+ description_node = XPath.first(root_node, "tagline")
2243
+ end
2244
+ if description_node.nil?
2245
+ description_node = XPath.first(root_node, "subtitle")
2246
+ end
2247
+ if description_node.nil?
2248
+ description_node = XPath.first(root_node, "summary")
2249
+ end
2250
+ if description_node.nil?
2251
+ description_node = XPath.first(root_node, "abstract")
2252
+ end
2253
+ if description_node.nil?
2254
+ description_node = XPath.first(root_node, "content:encoded")
2255
+ end
2256
+ if description_node.nil?
2257
+ description_node = XPath.first(root_node, "content")
2258
+ end
2259
+ if description_node.nil?
2260
+ description_node = XPath.first(root_node, "info")
2261
+ @bozo = true unless description_node.nil?
2262
+ end
2263
+ unless description_node.nil?
2264
+ if XPath.first(description_node, "@encoding").to_s != ""
2265
+ @description =
2266
+ "[Embedded data objects are not currently supported.]"
2267
+ elsif XPath.first(description_node, "@type").to_s == "xhtml" ||
2268
+ XPath.first(description_node, "@mode").to_s == "xhtml" ||
2269
+ XPath.first(description_node, "@type").to_s == "xml" ||
2270
+ XPath.first(description_node, "@mode").to_s == "xml" ||
2271
+ XPath.first(description_node, "@type").to_s ==
2272
+ "application/xhtml+xml"
2273
+ @description = description_node.inner_xml
2274
+ elsif XPath.first(description_node, "@type").to_s == "escaped" ||
2275
+ XPath.first(description_node, "@mode").to_s == "escaped"
2276
+ @description = FeedTools.unescape_entities(
2277
+ description_node.inner_xml)
2278
+ else
2279
+ @description = description_node.inner_xml
2280
+ repair_entities = true
1893
2281
  end
1894
2282
  end
1895
2283
  if @description == ""
@@ -1900,20 +2288,13 @@ module FeedTools
1900
2288
  @description = self.itunes_subtitle
1901
2289
  @description = "" if @description.nil?
1902
2290
  end
1903
- if @description == ""
1904
- @description = self.media_text
1905
- @description = "" if @description.nil?
1906
- end
1907
2291
 
1908
2292
  unless @description.nil?
1909
- @description = FeedTools.sanitize_html(@description)
2293
+ @description = FeedTools.sanitize_html(@description, :strip)
2294
+ @description = FeedTools.unescape_entities(@description) if repair_entities
2295
+ @description = FeedTools.tidy_html(@description)
1910
2296
  end
1911
2297
 
1912
- # If it started with a bunch of divs, hack them right off. We can put
1913
- # them back later if they're needed.
1914
- @description.gsub!(/^(<div[^>]*>)*/, "")
1915
- @description.gsub!(/(<\/div>)*$/, "")
1916
-
1917
2298
  @description.gsub!(/\n/, " ") if @description.size < 80
1918
2299
  @description = @description.strip unless @description.nil?
1919
2300
  @description = nil if @description == ""
@@ -1925,6 +2306,66 @@ module FeedTools
1925
2306
  def description=(new_description)
1926
2307
  @description = new_description
1927
2308
  end
2309
+
2310
+ # Returns the contents of the itunes:summary element
2311
+ def itunes_summary
2312
+ if @itunes_summary.nil?
2313
+ @itunes_summary = FeedTools.unescape_entities(XPath.first(root_node,
2314
+ "itunes:summary/text()").to_s)
2315
+ if @itunes_summary == ""
2316
+ @itunes_summary = nil
2317
+ end
2318
+ unless @itunes_summary.nil?
2319
+ @itunes_summary = FeedTools.sanitize_html(@itunes_summary)
2320
+ end
2321
+ end
2322
+ return @itunes_summary
2323
+ end
2324
+
2325
+ # Sets the contents of the itunes:summary element
2326
+ def itunes_summary=(new_itunes_summary)
2327
+ @itunes_summary = new_itunes_summary
2328
+ end
2329
+
2330
+ # Returns the contents of the itunes:subtitle element
2331
+ def itunes_subtitle
2332
+ if @itunes_subtitle.nil?
2333
+ @itunes_subtitle = FeedTools.unescape_entities(XPath.first(root_node,
2334
+ "itunes:subtitle/text()").to_s)
2335
+ if @itunes_subtitle == ""
2336
+ @itunes_subtitle = nil
2337
+ end
2338
+ unless @itunes_subtitle.nil?
2339
+ @itunes_subtitle = FeedTools.sanitize_html(@itunes_subtitle)
2340
+ end
2341
+ end
2342
+ return @itunes_subtitle
2343
+ end
2344
+
2345
+ # Sets the contents of the itunes:subtitle element
2346
+ def itunes_subtitle=(new_itunes_subtitle)
2347
+ @itunes_subtitle = new_itunes_subtitle
2348
+ end
2349
+
2350
+ # Returns the contents of the media:text element
2351
+ def media_text
2352
+ if @media_text.nil?
2353
+ @media_text = FeedTools.unescape_entities(XPath.first(root_node,
2354
+ "itunes:subtitle/text()").to_s)
2355
+ if @media_text == ""
2356
+ @media_text = nil
2357
+ end
2358
+ unless @media_text.nil?
2359
+ @media_text = FeedTools.sanitize_html(@media_text)
2360
+ end
2361
+ end
2362
+ return @media_text
2363
+ end
2364
+
2365
+ # Sets the contents of the media:text element
2366
+ def media_text=(new_media_text)
2367
+ @media_text = new_media_text
2368
+ end
1928
2369
 
1929
2370
  # Returns the feed item link
1930
2371
  def link
@@ -1948,7 +2389,7 @@ module FeedTools
1948
2389
  end
1949
2390
  end
1950
2391
  if @link != ""
1951
- @link = CGI.unescapeHTML(@link)
2392
+ @link = FeedTools.unescape_entities(@link)
1952
2393
  end
1953
2394
  if @link != "" && (@link =~ /http:\/\//) != 0 && (@link =~ /https:\/\//) != 0
1954
2395
  if (feed.base[-1..-1] == "/" && @link[0..0] == "/")
@@ -2084,23 +2525,6 @@ module FeedTools
2084
2525
  @media_thumbnail_link = new_media_thumbnail_link
2085
2526
  end
2086
2527
 
2087
- # Returns the feed items's unique id
2088
- def id
2089
- if @id.nil?
2090
- @id = XPath.first(root_node, "id/text()").to_s
2091
- if @id == ""
2092
- @id = XPath.first(root_node, "guid/text()").to_s
2093
- end
2094
- @id = nil if @id == ""
2095
- end
2096
- return @id
2097
- end
2098
-
2099
- # Sets the feed item's unique id
2100
- def id=(new_id)
2101
- @id = new_id
2102
- end
2103
-
2104
2528
  # Returns all feed item enclosures
2105
2529
  def enclosures
2106
2530
  if @enclosures.nil?
@@ -2116,7 +2540,7 @@ module FeedTools
2116
2540
  # sometimes these also manage to show up in atom files.
2117
2541
  for enclosure_node in rss_enclosures
2118
2542
  enclosure = Enclosure.new
2119
- enclosure.url = CGI.unescapeHTML(enclosure_node.attributes["url"].to_s)
2543
+ enclosure.url = FeedTools.unescape_entities(enclosure_node.attributes["url"].to_s)
2120
2544
  enclosure.type = enclosure_node.attributes["type"].to_s
2121
2545
  enclosure.file_size = enclosure_node.attributes["length"].to_i
2122
2546
  enclosure.credits = []
@@ -2127,7 +2551,7 @@ module FeedTools
2127
2551
  # Parse atom-type enclosures. If there are repeats of the same enclosure object,
2128
2552
  # we merge the two together.
2129
2553
  for enclosure_node in atom_enclosures
2130
- enclosure_url = CGI.unescapeHTML(enclosure_node.attributes["href"].to_s)
2554
+ enclosure_url = FeedTools.unescape_entities(enclosure_node.attributes["href"].to_s)
2131
2555
  enclosure = nil
2132
2556
  new_enclosure = false
2133
2557
  for existing_enclosure in @enclosures
@@ -2156,7 +2580,7 @@ module FeedTools
2156
2580
  parse_media_content = lambda do |media_content_nodes|
2157
2581
  affected_enclosures = []
2158
2582
  for enclosure_node in media_content_nodes
2159
- enclosure_url = CGI.unescapeHTML(enclosure_node.attributes["url"].to_s)
2583
+ enclosure_url = FeedTools.unescape_entities(enclosure_node.attributes["url"].to_s)
2160
2584
  enclosure = nil
2161
2585
  new_enclosure = false
2162
2586
  for existing_enclosure in @enclosures
@@ -2182,9 +2606,9 @@ module FeedTools
2182
2606
  (enclosure_node.attributes["isDefault"].to_s.downcase == "true")
2183
2607
  if XPath.first(enclosure_node, "media:thumbnail/@url").to_s != ""
2184
2608
  enclosure.thumbnail = EnclosureThumbnail.new(
2185
- CGI.unescapeHTML(XPath.first(enclosure_node, "media:thumbnail/@url").to_s),
2186
- CGI.unescapeHTML(XPath.first(enclosure_node, "media:thumbnail/@height").to_s),
2187
- CGI.unescapeHTML(XPath.first(enclosure_node, "media:thumbnail/@width").to_s)
2609
+ FeedTools.unescape_entities(XPath.first(enclosure_node, "media:thumbnail/@url").to_s),
2610
+ FeedTools.unescape_entities(XPath.first(enclosure_node, "media:thumbnail/@height").to_s),
2611
+ FeedTools.unescape_entities(XPath.first(enclosure_node, "media:thumbnail/@width").to_s)
2188
2612
  )
2189
2613
  if enclosure.thumbnail.height == ""
2190
2614
  enclosure.thumbnail.height = nil
@@ -2196,9 +2620,9 @@ module FeedTools
2196
2620
  enclosure.categories = []
2197
2621
  for category in XPath.match(enclosure_node, "media:category")
2198
2622
  enclosure.categories << EnclosureCategory.new(
2199
- CGI.unescapeHTML(category.text),
2200
- CGI.unescapeHTML(category.attributes["scheme"].to_s),
2201
- CGI.unescapeHTML(category.attributes["label"].to_s)
2623
+ FeedTools.unescape_entities(category.text),
2624
+ FeedTools.unescape_entities(category.attributes["scheme"].to_s),
2625
+ FeedTools.unescape_entities(category.attributes["label"].to_s)
2202
2626
  )
2203
2627
  if enclosure.categories.last.scheme == ""
2204
2628
  enclosure.categories.last.scheme = nil
@@ -2209,16 +2633,16 @@ module FeedTools
2209
2633
  end
2210
2634
  if XPath.first(enclosure_node, "media:hash/text()").to_s != ""
2211
2635
  enclosure.hash = EnclosureHash.new(
2212
- FeedTools.sanitize_html(CGI.unescapeHTML(XPath.first(
2636
+ FeedTools.sanitize_html(FeedTools.unescape_entities(XPath.first(
2213
2637
  enclosure_node, "media:hash/text()").to_s), :strip),
2214
2638
  "md5"
2215
2639
  )
2216
2640
  end
2217
2641
  if XPath.first(enclosure_node, "media:player/@url").to_s != ""
2218
2642
  enclosure.player = EnclosurePlayer.new(
2219
- CGI.unescapeHTML(XPath.first(enclosure_node, "media:player/@url").to_s),
2220
- CGI.unescapeHTML(XPath.first(enclosure_node, "media:player/@height").to_s),
2221
- CGI.unescapeHTML(XPath.first(enclosure_node, "media:player/@width").to_s)
2643
+ FeedTools.unescape_entities(XPath.first(enclosure_node, "media:player/@url").to_s),
2644
+ FeedTools.unescape_entities(XPath.first(enclosure_node, "media:player/@height").to_s),
2645
+ FeedTools.unescape_entities(XPath.first(enclosure_node, "media:player/@width").to_s)
2222
2646
  )
2223
2647
  if enclosure.player.height == ""
2224
2648
  enclosure.player.height = nil
@@ -2230,8 +2654,8 @@ module FeedTools
2230
2654
  enclosure.credits = []
2231
2655
  for credit in XPath.match(enclosure_node, "media:credit")
2232
2656
  enclosure.credits << EnclosureCredit.new(
2233
- CGI.unescapeHTML(CGI.unescapeHTML(credit.text)),
2234
- CGI.unescapeHTML(credit.attributes["role"].to_s.downcase)
2657
+ FeedTools.unescape_entities(credit.text),
2658
+ FeedTools.unescape_entities(credit.attributes["role"].to_s.downcase)
2235
2659
  )
2236
2660
  if enclosure.credits.last.role == ""
2237
2661
  enclosure.credits.last.role = nil
@@ -2240,7 +2664,7 @@ module FeedTools
2240
2664
  enclosure.explicit = (XPath.first(enclosure_node,
2241
2665
  "media:adult/text()").to_s.downcase == "true")
2242
2666
  if XPath.first(enclosure_node, "media:text/text()").to_s != ""
2243
- enclosure.text = CGI.unescapeHTML(XPath.first(enclosure_node,
2667
+ enclosure.text = FeedTools.unescape_entities(XPath.first(enclosure_node,
2244
2668
  "media:text/text()").to_s)
2245
2669
  end
2246
2670
  affected_enclosures << enclosure
@@ -2271,11 +2695,11 @@ module FeedTools
2271
2695
  if enclosure.thumbnail.nil? &&
2272
2696
  XPath.first(media_group, "media:thumbnail/@url").to_s != ""
2273
2697
  enclosure.thumbnail = EnclosureThumbnail.new(
2274
- CGI.unescapeHTML(
2698
+ FeedTools.unescape_entities(
2275
2699
  XPath.first(media_group, "media:thumbnail/@url").to_s),
2276
- CGI.unescapeHTML(
2700
+ FeedTools.unescape_entities(
2277
2701
  XPath.first(media_group, "media:thumbnail/@height").to_s),
2278
- CGI.unescapeHTML(
2702
+ FeedTools.unescape_entities(
2279
2703
  XPath.first(media_group, "media:thumbnail/@width").to_s)
2280
2704
  )
2281
2705
  if enclosure.thumbnail.height == ""
@@ -2289,9 +2713,9 @@ module FeedTools
2289
2713
  enclosure.categories = []
2290
2714
  for category in XPath.match(media_group, "media:category")
2291
2715
  enclosure.categories << EnclosureCategory.new(
2292
- CGI.unescapeHTML(category.text),
2293
- CGI.unescapeHTML(category.attributes["scheme"].to_s),
2294
- CGI.unescapeHTML(category.attributes["label"].to_s)
2716
+ FeedTools.unescape_entities(category.text),
2717
+ FeedTools.unescape_entities(category.attributes["scheme"].to_s),
2718
+ FeedTools.unescape_entities(category.attributes["label"].to_s)
2295
2719
  )
2296
2720
  if enclosure.categories.last.scheme == ""
2297
2721
  enclosure.categories.last.scheme = nil
@@ -2304,16 +2728,16 @@ module FeedTools
2304
2728
  if enclosure.hash.nil? &&
2305
2729
  XPath.first(media_group, "media:hash/text()").to_s != ""
2306
2730
  enclosure.hash = EnclosureHash.new(
2307
- CGI.unescapeHTML(XPath.first(media_group, "media:hash/text()").to_s),
2731
+ FeedTools.unescape_entities(XPath.first(media_group, "media:hash/text()").to_s),
2308
2732
  "md5"
2309
2733
  )
2310
2734
  end
2311
2735
  if enclosure.player.nil? &&
2312
2736
  XPath.first(media_group, "media:player/@url").to_s != ""
2313
2737
  enclosure.player = EnclosurePlayer.new(
2314
- CGI.unescapeHTML(XPath.first(media_group, "media:player/@url").to_s),
2315
- CGI.unescapeHTML(XPath.first(media_group, "media:player/@height").to_s),
2316
- CGI.unescapeHTML(XPath.first(media_group, "media:player/@width").to_s)
2738
+ FeedTools.unescape_entities(XPath.first(media_group, "media:player/@url").to_s),
2739
+ FeedTools.unescape_entities(XPath.first(media_group, "media:player/@height").to_s),
2740
+ FeedTools.unescape_entities(XPath.first(media_group, "media:player/@width").to_s)
2317
2741
  )
2318
2742
  if enclosure.player.height == ""
2319
2743
  enclosure.player.height = nil
@@ -2326,8 +2750,8 @@ module FeedTools
2326
2750
  enclosure.credits = []
2327
2751
  for credit in XPath.match(media_group, "media:credit")
2328
2752
  enclosure.credits << EnclosureCredit.new(
2329
- CGI.unescapeHTML(CGI.unescapeHTML(credit.text)),
2330
- CGI.unescapeHTML(credit.attributes["role"].to_s.downcase)
2753
+ FeedTools.unescape_entities(credit.text),
2754
+ FeedTools.unescape_entities(credit.attributes["role"].to_s.downcase)
2331
2755
  )
2332
2756
  if enclosure.credits.last.role == ""
2333
2757
  enclosure.credits.last.role = nil
@@ -2340,7 +2764,7 @@ module FeedTools
2340
2764
  end
2341
2765
  if enclosure.text.nil? &&
2342
2766
  XPath.first(media_group, "media:text/text()").to_s != ""
2343
- enclosure.text = FeedTools.sanitize_html(CGI.unescapeHTML(
2767
+ enclosure.text = FeedTools.sanitize_html(FeedTools.unescape_entities(
2344
2768
  XPath.first(media_group, "media:text/text()").to_s), :strip)
2345
2769
  end
2346
2770
  end
@@ -2373,9 +2797,9 @@ module FeedTools
2373
2797
  enclosure.categories = []
2374
2798
  end
2375
2799
  enclosure.categories << EnclosureCategory.new(
2376
- CGI.unescapeHTML(category_path),
2377
- CGI.unescapeHTML("http://www.apple.com/itunes/store/"),
2378
- CGI.unescapeHTML("iTunes Music Store Categories")
2800
+ FeedTools.unescape_entities(category_path),
2801
+ FeedTools.unescape_entities("http://www.apple.com/itunes/store/"),
2802
+ FeedTools.unescape_entities("iTunes Music Store Categories")
2379
2803
  )
2380
2804
  end
2381
2805
  end
@@ -2464,136 +2888,140 @@ module FeedTools
2464
2888
  def enclosures=(new_enclosures)
2465
2889
  @enclosures = new_enclosures
2466
2890
  end
2467
-
2468
- # Returns the feed item author
2469
- def author_name
2470
- # TODO: make this not suck, actually ensure we're looking at a name
2471
- # and not an email address.
2472
- # Also, factor in itunes module.
2473
- # =================================================================
2474
- if @author_name.nil?
2475
- @author_name = CGI.unescapeHTML(XPath.first(root_node, "author/name/text()").to_s)
2476
- if @author_name == ""
2477
- @author_name = CGI.unescapeHTML(XPath.first(root_node, "dc:creator/text()").to_s)
2478
- end
2479
- if @author_name == ""
2480
- @author_name = CGI.unescapeHTML(XPath.first(root_node, "author/text()").to_s)
2481
- end
2482
- end
2483
- return @author_name
2484
- end
2485
2891
 
2486
- # Sets the feed item author
2487
- def author_name=(new_author_name)
2488
- @author_name = new_author_name
2489
- end
2490
-
2491
- # Returns the contents of the itunes:summary element
2492
- def itunes_summary
2493
- if @itunes_summary.nil?
2494
- @itunes_summary = CGI.unescapeHTML(XPath.first(root_node,
2495
- "itunes:summary/text()").to_s)
2496
- if @itunes_summary == ""
2497
- @itunes_summary = nil
2498
- end
2499
- unless @itunes_summary.nil?
2500
- @itunes_summary = FeedTools.sanitize_html(@itunes_summary)
2892
+ # Returns the feed item author
2893
+ def author
2894
+ if @author.nil?
2895
+ @author = FeedTools::Feed::Author.new
2896
+
2897
+ # Set the author name
2898
+ @author.name = FeedTools.unescape_entities(
2899
+ XPath.first(root_node, "author/name/text()").to_s)
2900
+
2901
+ @author.raw = FeedTools.unescape_entities(
2902
+ XPath.first(root_node, "author/text()").to_s)
2903
+ if @author.raw == ""
2904
+ @author.raw = FeedTools.unescape_entities(
2905
+ XPath.first(root_node, "dc:creator/text()").to_s)
2906
+ end
2907
+ if @author.raw == ""
2908
+ @author.raw = FeedTools.unescape_entities(
2909
+ XPath.first(root_node, "dc:author/text()").to_s)
2910
+ end
2911
+ unless @author.raw == ""
2912
+ raw_scan = @author.raw.scan(
2913
+ /(.*)\((\b[A-Z0-9._%-\+]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b)\)/i)
2914
+ if raw_scan.nil? || raw_scan.size == 0
2915
+ raw_scan = @author.raw.scan(
2916
+ /(\b[A-Z0-9._%-\+]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b)\s*\((.*)\)/i)
2917
+ author_raw_pair = raw_scan.first.reverse unless raw_scan.size == 0
2918
+ else
2919
+ author_raw_pair = raw_scan.first
2920
+ end
2921
+ if raw_scan.nil? || raw_scan.size == 0
2922
+ email_scan = @author.raw.scan(
2923
+ /\b[A-Z0-9._%-\+]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b/i)
2924
+ if email_scan != nil && email_scan.size > 0
2925
+ @author.email = email_scan.first.strip
2926
+ end
2927
+ end
2928
+ unless author_raw_pair.nil? || author_raw_pair.size == 0
2929
+ @author.name = author_raw_pair.first.strip
2930
+ @author.email = author_raw_pair.last.strip
2931
+ else
2932
+ unless @author.raw.include?("@")
2933
+ # We can be reasonably sure we are looking at something
2934
+ # that the creator didn't intend to contain an email address if
2935
+ # it got through the preceeding regexes and it doesn't
2936
+ # contain the tell-tale '@' symbol.
2937
+ @author.name = @author.raw
2938
+ end
2939
+ end
2501
2940
  end
2502
- end
2503
- return @itunes_summary
2504
- end
2505
2941
 
2506
- # Sets the contents of the itunes:summary element
2507
- def itunes_summary=(new_itunes_summary)
2508
- @itunes_summary = new_itunes_summary
2509
- end
2942
+ @author.name = nil if @author.name == ""
2943
+ @author.raw = nil if @author.raw == ""
2510
2944
 
2511
- # Returns the contents of the itunes:subtitle element
2512
- def itunes_subtitle
2513
- if @itunes_subtitle.nil?
2514
- @itunes_subtitle = CGI.unescapeHTML(XPath.first(root_node,
2515
- "itunes:subtitle/text()").to_s)
2516
- if @itunes_subtitle == ""
2517
- @itunes_subtitle = nil
2945
+ # Set the author email
2946
+ if @author.email == ""
2947
+ @author.email = FeedTools.unescape_entities(
2948
+ XPath.first(root_node, "author/email/text()").to_s)
2518
2949
  end
2519
- unless @itunes_subtitle.nil?
2520
- @itunes_subtitle = FeedTools.sanitize_html(@itunes_subtitle)
2521
- end
2522
- end
2523
- return @itunes_subtitle
2524
- end
2950
+ @author.email = nil if @author.email == ""
2525
2951
 
2526
- # Sets the contents of the itunes:subtitle element
2527
- def itunes_subtitle=(new_itunes_subtitle)
2528
- @itunes_subtitle = new_itunes_subtitle
2529
- end
2952
+ # Set the author url
2953
+ @author.url = FeedTools.unescape_entities(
2954
+ XPath.first(root_node, "author/url/text()").to_s)
2955
+ @author.url = nil if @author.url == ""
2530
2956
 
2531
- # Returns the contents of the media:text element
2532
- def media_text
2533
- if @media_text.nil?
2534
- @media_text = CGI.unescapeHTML(XPath.first(root_node,
2535
- "itunes:subtitle/text()").to_s)
2536
- if @media_text == ""
2537
- @media_text = nil
2538
- end
2539
- unless @media_text.nil?
2540
- @media_text = FeedTools.sanitize_html(@media_text)
2957
+ # Fallback on the itunes module if we didn't find an author name
2958
+ begin
2959
+ @author.name = self.itunes_author if @author.name.nil?
2960
+ rescue
2961
+ @author.name = nil
2541
2962
  end
2542
2963
  end
2543
- return @media_text
2964
+ return @author
2544
2965
  end
2545
-
2546
- # Sets the contents of the media:text element
2547
- def media_text=(new_media_text)
2548
- @media_text = new_media_text
2966
+
2967
+ # Sets the feed item author
2968
+ def author=(new_author)
2969
+ if new_author.respond_to?(:name) &&
2970
+ new_author.respond_to?(:email) &&
2971
+ new_author.respond_to?(:url)
2972
+ # It's a complete author object, just set it.
2973
+ @author = new_author
2974
+ else
2975
+ # We're not looking at an author object, this is probably a string,
2976
+ # default to setting the author's name.
2977
+ if @author.nil?
2978
+ @author = FeedTools::Feed::Author.new
2979
+ end
2980
+ @author.name = new_author
2981
+ end
2549
2982
  end
2550
2983
 
2551
2984
  # Returns the contents of the itunes:author element
2552
2985
  #
2553
2986
  # This inherits from any incorrectly placed channel-level itunes:author
2554
- # elements. They're actually amazingly commong. People don't read specs.
2987
+ # elements. They're actually amazingly common. People don't read specs.
2555
2988
  def itunes_author
2556
2989
  if @itunes_author.nil?
2557
- @itunes_author = CGI.unescapeHTML(XPath.first(root_node,
2990
+ @itunes_author = FeedTools.unescape_entities(XPath.first(root_node,
2558
2991
  "itunes:author/text()").to_s)
2559
- if @itunes_author == ""
2560
- @itunes_author = CGI.unescapeHTML(XPath.first(feed.channel_node,
2561
- "itunes:author/text()").to_s)
2562
- end
2563
- if @itunes_author == ""
2564
- @itunes_author = nil
2565
- end
2992
+ @itunes_author = feed.itunes_author if @itunes_author == ""
2993
+ @itunes_author = nil if @itunes_author == ""
2566
2994
  end
2567
2995
  return @itunes_author
2568
2996
  end
2569
-
2997
+
2570
2998
  # Sets the contents of the itunes:author element
2571
2999
  def itunes_author=(new_itunes_author)
2572
3000
  @itunes_author = new_itunes_author
2573
- end
2574
-
3001
+ end
3002
+
2575
3003
  # Returns the number of seconds that the associated media runs for
2576
- def duration
2577
- if @duration.nil?
2578
- itunes_duration = CGI.unescapeHTML(XPath.first(root_node,
3004
+ def itunes_duration
3005
+ if @itunes_duration.nil?
3006
+ raw_duration = FeedTools.unescape_entities(XPath.first(root_node,
2579
3007
  "itunes:duration/text()").to_s)
2580
- if itunes_duration != ""
2581
- hms = itunes_duration.split(":").map { |x| x.to_i }
3008
+ if raw_duration != ""
3009
+ hms = raw_duration.split(":").map { |x| x.to_i }
2582
3010
  if hms.size == 3
2583
- @duration = hms[0].hour + hms[1].minute + hms[2]
3011
+ @itunes_duration = hms[0].hour + hms[1].minute + hms[2]
2584
3012
  elsif hms.size == 2
2585
- @duration = hms[0].minute + hms[1]
3013
+ @itunes_duration = hms[0].minute + hms[1]
2586
3014
  elsif hms.size == 1
2587
- @duration = hms[0]
3015
+ @itunes_duration = hms[0]
2588
3016
  end
2589
3017
  end
2590
3018
  end
2591
- return @duration
3019
+ return @itunes_duration
2592
3020
  end
2593
3021
 
2594
3022
  # Sets the number of seconds that the associate media runs for
2595
- def duration=(new_duration)
2596
- @duration = new_duration
3023
+ def itunes_duration=(new_itunes_duration)
3024
+ @itunes_duration = new_itunes_duration
2597
3025
  end
2598
3026
 
2599
3027
  # Sets the itunes:summary
@@ -2722,7 +3150,8 @@ module FeedTools
2722
3150
  end
2723
3151
 
2724
3152
  # Generates xml based on the content of the feed item
2725
- def build_xml(feed_type="rss", version=0.0, xml_builder=Builder::XmlMarkup.new(:indent => 2))
3153
+ def build_xml(feed_type=(self.feed.feed_type or "rss"), version=0.0,
3154
+ xml_builder=Builder::XmlMarkup.new(:indent => 2))
2726
3155
  if feed_type == "rss" && (version == 0.9 || version == 1.0 || version == 1.1)
2727
3156
  # RDF-based rss format
2728
3157
  if link.nil?
@@ -2831,9 +3260,9 @@ module FeedTools
2831
3260
  end
2832
3261
  end
2833
3262
 
2834
- module REXML #:nodoc:
2835
- class Element #:nodoc:
2836
- def inner_xml #:nodoc:
3263
+ module REXML # :nodoc:
3264
+ class Element # :nodoc:
3265
+ def inner_xml # :nodoc:
2837
3266
  result = ""
2838
3267
  self.each_child do |child|
2839
3268
  result << child.to_s
@@ -2848,4 +3277,4 @@ begin
2848
3277
  FeedTools.feed_cache.initialize_cache
2849
3278
  end
2850
3279
  rescue
2851
- end
3280
+ end