feedme 0.8.2 → 0.8.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,4 +1,13 @@
1
- === 0.8.2
1
+ === 0.8.3 / 2010-05-27
2
+
3
+ * Attempt to recognize incomplete RSS documents with a simpler regular
4
+ expression, since the full one causes FeedMe to hang on large documents.
5
+ * Fix NPE in html-cleaner.
6
+ * Add concept of value selectors: when calling the singular accessor for a tag
7
+ that has multiple instances, a value selector chooses which to return. Provide a
8
+ default value selector for link tags.
9
+
10
+ === 0.8.2 / 2010-01-14
2
11
 
3
12
  * Remove VERSION variable from feedme.rb
4
13
  * Don't double-escape CDATA-escaped content
@@ -14,13 +14,14 @@ def fetch(url)
14
14
  end
15
15
 
16
16
  # read from a file
17
- content = ""
18
- File.open('rocketboom.rss', "r") do |file|
19
- content = file.read
20
- end
17
+ #content = ""
18
+ #File.open('bad.rss', "r") do |file|
19
+ # content = file.read
20
+ #end
21
21
 
22
22
  # read from a url
23
23
  #content = fetch('http://www.rocketboom.com/rss/hd.xml')
24
+ content = fetch('http://failbooking.com/feed/')
24
25
 
25
26
  # create a new ParserBuilder
26
27
  builder = FeedMe::ParserBuilder.new
@@ -45,7 +46,10 @@ rss.items.each do |item|
45
46
  puts "Categories: #{item.category_array.join(', ')}" if item.category_array?
46
47
  # ! causes value to be modified according to prior specifications
47
48
  # ? checks for the presence of a tag/attribute
48
- puts "Description:\n#{item.description!}" if item.description?
49
+ puts "Description:\n#{item.description}" if item.description?
49
50
  # we can access attribute values just as easily as tag content
50
51
  puts "Enclosure: #{item.enclosure.url}" if item.enclosure?
52
+ loc = 'media:content'
53
+ mc = item.call_virtual_method("#{loc}_values".to_sym)
54
+ puts mc.join(',')
51
55
  end
@@ -17,6 +17,9 @@ module FeedMe
17
17
  NOKOGIRI_HELPER = 'nokogiri-util.rb'
18
18
  HPRICOT_HELPER = 'hpricot-util.rb'
19
19
 
20
+ # default rels to accept, in order of preference
21
+ DEFAULT_RELS = [ 'self', 'alternate', 'enclosure', 'related', 'edit', 'replies', 'via' ]
22
+
20
23
  # Parse a feed using the promiscuous parser.
21
24
  def FeedMe.parse(source, options={})
22
25
  ParserBuilder.new(options).parse(source)
@@ -46,6 +49,9 @@ module FeedMe
46
49
  attr_accessor :value_tags
47
50
  # Tags to use for element value when specific tag isn't specified
48
51
  attr_accessor :default_value_tags
52
+ # A hash of functions for selecting the correct value to return when a tag
53
+ # has multiple values and the singular accessor is called
54
+ attr_accessor :value_selectors
49
55
  # A hash of attribute/tag name aliases.
50
56
  attr_accessor :aliases
51
57
  # An array of the transformation functions applied when the !
@@ -106,6 +112,19 @@ module FeedMe
106
112
  }
107
113
  @default_value_tags = [ CONTENT_KEY, :href, :url ]
108
114
 
115
+ # methods for selecting the element to return when the singular accessor
116
+ # is called on a tag with multiple values
117
+ @value_selectors = {
118
+ :link => proc do |links|
119
+ links = links.sort do |a,b|
120
+ i1 = DEFAULT_RELS.index(a.rel)
121
+ i2 = DEFAULT_RELS.index(b.rel)
122
+ i1.nil? ? (i2.nil? ? 0 : 1) : (i2.nil? ? -1 : i1 <=> i2)
123
+ end
124
+ links.first
125
+ end
126
+ }
127
+
109
128
  # tag/attribute aliases
110
129
  @aliases = {
111
130
  :items => :item_array,
@@ -209,7 +228,7 @@ module FeedMe
209
228
  end
210
229
  end
211
230
 
212
- #
231
+ # This class is used to create strict parsers
213
232
  class StrictParserBuilder < ParserBuilder
214
233
  attr_accessor :feed_ext_tags, :item_ext_tags, :rels
215
234
 
@@ -268,7 +287,7 @@ module FeedMe
268
287
  ]
269
288
 
270
289
  @rels = {
271
- :link => [ 'self', 'alternate', 'edit', 'replies', 'related', 'enclosure', 'via' ]
290
+ :link => DEFAULT_RELS
272
291
  }
273
292
 
274
293
  # extensions
@@ -370,7 +389,9 @@ module FeedMe
370
389
  # 1. Tag/attribute name: since tags/attributes are stored as arrays,
371
390
  # the instance variable name is the tag/attribute name followed by
372
391
  # '_array'. The tag/attribute name is actually a virtual method that
373
- # returns the first element in the array.
392
+ # returns the first element in the array. If a Proc is passed as the first
393
+ # argument and the array has more than one element, the Proc is called with
394
+ # the array and its result is returned (or the first element if it returns nil).
374
395
  # 2. Aliases: for tags/attributes with aliases, the alias is a virtual
375
396
  # method that simply forwards to the aliased method.
376
397
  # 3. Any name that ends with a '?' returns true if the name without
@@ -401,7 +422,15 @@ module FeedMe
401
422
  result = if key? name
402
423
  self[name]
403
424
  elsif key? array_key
404
- self[array_key].first
425
+ array = self[array_key]
426
+ elt = if array.size > 1
427
+ if (!args.empty? && args.first.is_a?(Proc))
428
+ args.first.call(array)
429
+ elsif (fm_builder.value_sorters.key?(name))
430
+ value_selectors[name].call(array)
431
+ end
432
+ end
433
+ elt || array.first
405
434
  elsif name_str[-1,1] == '?'
406
435
  !call_virtual_method(name_str[0..-2], args, history).nil? rescue false
407
436
  elsif name_str[-1,1] == '!'
@@ -503,9 +532,11 @@ module FeedMe
503
532
 
504
533
  trans = fm_builder.transformation_fns[t_name] or
505
534
  raise NoMethodError.new("No such transformation #{t_name}", t_name)
506
-
535
+
507
536
  if value.is_a? Array
508
- value = value.collect {|x| trans.call(x, *args) }
537
+ value = value.collect do |x|
538
+ x.nil? ? nil : trans.call(x, *args)
539
+ end.compact
509
540
  else
510
541
  value = trans.call(value, *args)
511
542
  end
@@ -556,8 +587,11 @@ module FeedMe
556
587
  private
557
588
 
558
589
  def parse
559
- # RSS = everything between channel tags + everthing between </channel> and </rdf> if this is an RDF document
560
- if @fm_source =~ %r{<(?:.*?:)?(rss|rdf)(.*?)>.*?<(?:.*?:)?channel(.*?)>(.+)</(?:.*?:)?channel>(.*)</(?:.*?:)?(?:rss|rdf)>}mi
590
+ # RSS = everything between channel tags + everything between </channel> and
591
+ # </rdf> if this is an RDF document. Do a simpler match to begin with
592
+ # since the more complex regexp will hang on a large and invalid document.
593
+ if @fm_source =~ %r{<(?:.*?:)?channel.+</(?:.*?:)?channel}mi &&
594
+ @fm_source =~ %r{<(?:.*?:)?(rss|rdf)(.*?)>.*?<(?:.*?:)?channel(.*?)>(.+)</(?:.*?:)?channel>(.*)</(?:.*?:)?(?:rss|rdf)>}mi
561
595
  @fm_type = $1.upcase.to_s
562
596
  @fm_tags = fm_builder.all_rss_tags
563
597
  attrs = parse_attributes($1, $2 + $3)
@@ -655,7 +689,6 @@ module FeedMe
655
689
  end
656
690
 
657
691
  @fm_unparsed += elements.keys
658
-
659
692
  @fm_parsed.uniq!
660
693
  @fm_unparsed.uniq!
661
694
  end
@@ -30,6 +30,7 @@ module FeedMe
30
30
  # sanitize HTML
31
31
  # todo: dup code to fix bugs
32
32
  def clean_html(html)
33
+ return nil if html.nil?
33
34
  FeedMe::HtmlCleaner.clean(html)
34
35
  end
35
36
  end
@@ -58,6 +58,7 @@ module FeedMe
58
58
  #
59
59
  # Extra (i.e. unmatched) ending tags and comments are removed.
60
60
  def clean(str)
61
+ return nil if str.nil?
61
62
  str = unescapeHTML(str)
62
63
  doc = Hpricot(str, :fixup_tags => true)
63
64
  doc = subtree(doc, :body)
@@ -137,7 +138,8 @@ module FeedMe
137
138
  end
138
139
 
139
140
  # unescapes HTML. If xml is true, also converts XML-only named entities to HTML.
140
- def unescapeHTML(str, xml = true)
141
+ def unescapeHTML(str, xml=true)
142
+ return nil if str.nil?
141
143
  CGI.unescapeHTML(xml ? str.gsub("&apos;", "&#39;") : str)
142
144
  end
143
145
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: feedme
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.2
4
+ version: 0.8.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Didion
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-01-14 00:00:00 -05:00
12
+ date: 2010-05-27 00:00:00 -04:00
13
13
  default_executable:
14
14
  dependencies: []
15
15