feedme 0.8.2 → 0.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,13 @@
1
- === 0.8.2
1
+ === 0.8.3 / 2010-05-27
2
+
3
+ * Attempt to recognize incomplete RSS documents with a simpler regular
4
+ expression, since the full one causes FeedMe to hang on large documents.
5
+ * Fix NPE in html-cleaner.
6
+ * Add concept of value selectors: when calling the singular accessor for a tag
7
+ that has multiple instances, a value selector chooses which to return. Provide a
8
+ default value selector for link tags.
9
+
10
+ === 0.8.2 / 2010-01-14
2
11
 
3
12
  * Remove VERSION variable from feedme.rb
4
13
  * Don't double-escape CDATA-escaped content
@@ -14,13 +14,14 @@ def fetch(url)
14
14
  end
15
15
 
16
16
  # read from a file
17
- content = ""
18
- File.open('rocketboom.rss', "r") do |file|
19
- content = file.read
20
- end
17
+ #content = ""
18
+ #File.open('bad.rss', "r") do |file|
19
+ # content = file.read
20
+ #end
21
21
 
22
22
  # read from a url
23
23
  #content = fetch('http://www.rocketboom.com/rss/hd.xml')
24
+ content = fetch('http://failbooking.com/feed/')
24
25
 
25
26
  # create a new ParserBuilder
26
27
  builder = FeedMe::ParserBuilder.new
@@ -45,7 +46,10 @@ rss.items.each do |item|
45
46
  puts "Categories: #{item.category_array.join(', ')}" if item.category_array?
46
47
  # ! causes value to be modified according to prior specifications
47
48
  # ? checks for the presence of a tag/attribute
48
- puts "Description:\n#{item.description!}" if item.description?
49
+ puts "Description:\n#{item.description}" if item.description?
49
50
  # we can access attribute values just as easily as tag content
50
51
  puts "Enclosure: #{item.enclosure.url}" if item.enclosure?
52
+ loc = 'media:content'
53
+ mc = item.call_virtual_method("#{loc}_values".to_sym)
54
+ puts mc.join(',')
51
55
  end
@@ -17,6 +17,9 @@ module FeedMe
17
17
  NOKOGIRI_HELPER = 'nokogiri-util.rb'
18
18
  HPRICOT_HELPER = 'hpricot-util.rb'
19
19
 
20
+ # default rels to accept, in order of preference
21
+ DEFAULT_RELS = [ 'self', 'alternate', 'enclosure', 'related', 'edit', 'replies', 'via' ]
22
+
20
23
  # Parse a feed using the promiscuous parser.
21
24
  def FeedMe.parse(source, options={})
22
25
  ParserBuilder.new(options).parse(source)
@@ -46,6 +49,9 @@ module FeedMe
46
49
  attr_accessor :value_tags
47
50
  # Tags to use for element value when specific tag isn't specified
48
51
  attr_accessor :default_value_tags
52
+ # A hash of functions for selecting the correct value to return when a tag
53
+ # has multiple values and the singular accessor is called
54
+ attr_accessor :value_selectors
49
55
  # A hash of attribute/tag name aliases.
50
56
  attr_accessor :aliases
51
57
  # An array of the transformation functions applied when the !
@@ -106,6 +112,19 @@ module FeedMe
106
112
  }
107
113
  @default_value_tags = [ CONTENT_KEY, :href, :url ]
108
114
 
115
+ # methods for selecting the element to return when the singular accessor
116
+ # is called on a tag with multiple values
117
+ @value_selectors = {
118
+ :link => proc do |links|
119
+ links = links.sort do |a,b|
120
+ i1 = DEFAULT_RELS.index(a.rel)
121
+ i2 = DEFAULT_RELS.index(b.rel)
122
+ i1.nil? ? (i2.nil? ? 0 : 1) : (i2.nil? ? -1 : i1 <=> i2)
123
+ end
124
+ links.first
125
+ end
126
+ }
127
+
109
128
  # tag/attribute aliases
110
129
  @aliases = {
111
130
  :items => :item_array,
@@ -209,7 +228,7 @@ module FeedMe
209
228
  end
210
229
  end
211
230
 
212
- #
231
+ # This class is used to create strict parsers
213
232
  class StrictParserBuilder < ParserBuilder
214
233
  attr_accessor :feed_ext_tags, :item_ext_tags, :rels
215
234
 
@@ -268,7 +287,7 @@ module FeedMe
268
287
  ]
269
288
 
270
289
  @rels = {
271
- :link => [ 'self', 'alternate', 'edit', 'replies', 'related', 'enclosure', 'via' ]
290
+ :link => DEFAULT_RELS
272
291
  }
273
292
 
274
293
  # extensions
@@ -370,7 +389,9 @@ module FeedMe
370
389
  # 1. Tag/attribute name: since tags/attributes are stored as arrays,
371
390
  # the instance variable name is the tag/attribute name followed by
372
391
  # '_array'. The tag/attribute name is actually a virtual method that
373
- # returns the first element in the array.
392
+ # returns the first element in the array. If a Proc is passed as the first
393
+ # argument and the array has more than one element, the Proc is called with
394
+ # the array and its result is returned (or the first element if it is nil).
374
395
  # 2. Aliases: for tags/attributes with aliases, the alias is a virtual
375
396
  # method that simply forwards to the aliased method.
376
397
  # 3. Any name that ends with a '?' returns true if the name without
@@ -401,7 +422,15 @@ module FeedMe
401
422
  result = if key? name
402
423
  self[name]
403
424
  elsif key? array_key
404
- self[array_key].first
425
+ array = self[array_key]
426
+ elt = if array.size > 1
427
+ if (!args.empty? && args.first.is_a?(Proc))
428
+ args.first.call(array)
429
+ elsif (fm_builder.value_sorters.key?(name))
430
+ value_selectors[name].call(array)
431
+ end
432
+ end
433
+ elt || array.first
405
434
  elsif name_str[-1,1] == '?'
406
435
  !call_virtual_method(name_str[0..-2], args, history).nil? rescue false
407
436
  elsif name_str[-1,1] == '!'
@@ -503,9 +532,11 @@ module FeedMe
503
532
 
504
533
  trans = fm_builder.transformation_fns[t_name] or
505
534
  raise NoMethodError.new("No such transformation #{t_name}", t_name)
506
-
535
+
507
536
  if value.is_a? Array
508
- value = value.collect {|x| trans.call(x, *args) }
537
+ value = value.collect do |x|
538
+ x.nil? ? nil : trans.call(x, *args)
539
+ end.compact
509
540
  else
510
541
  value = trans.call(value, *args)
511
542
  end
@@ -556,8 +587,11 @@ module FeedMe
556
587
  private
557
588
 
558
589
  def parse
559
- # RSS = everything between channel tags + everthing between </channel> and </rdf> if this is an RDF document
560
- if @fm_source =~ %r{<(?:.*?:)?(rss|rdf)(.*?)>.*?<(?:.*?:)?channel(.*?)>(.+)</(?:.*?:)?channel>(.*)</(?:.*?:)?(?:rss|rdf)>}mi
590
+ # RSS = everything between channel tags + everything between </channel> and
591
+ # </rdf> if this is an RDF document. Do a simpler match to begin with
592
+ # since the more complex regexp will hang on a large and invalid document.
593
+ if @fm_source =~ %r{<(?:.*?:)?channel.+</(?:.*?:)?channel}mi &&
594
+ @fm_source =~ %r{<(?:.*?:)?(rss|rdf)(.*?)>.*?<(?:.*?:)?channel(.*?)>(.+)</(?:.*?:)?channel>(.*)</(?:.*?:)?(?:rss|rdf)>}mi
561
595
  @fm_type = $1.upcase.to_s
562
596
  @fm_tags = fm_builder.all_rss_tags
563
597
  attrs = parse_attributes($1, $2 + $3)
@@ -655,7 +689,6 @@ module FeedMe
655
689
  end
656
690
 
657
691
  @fm_unparsed += elements.keys
658
-
659
692
  @fm_parsed.uniq!
660
693
  @fm_unparsed.uniq!
661
694
  end
@@ -30,6 +30,7 @@ module FeedMe
30
30
  # sanitize HTML
31
31
  # todo: dup code to fix bugs
32
32
  def clean_html(html)
33
+ return nil if html.nil?
33
34
  FeedMe::HtmlCleaner.clean(html)
34
35
  end
35
36
  end
@@ -58,6 +58,7 @@ module FeedMe
58
58
  #
59
59
  # Extra (i.e. unmatched) ending tags and comments are removed.
60
60
  def clean(str)
61
+ return nil if str.nil?
61
62
  str = unescapeHTML(str)
62
63
  doc = Hpricot(str, :fixup_tags => true)
63
64
  doc = subtree(doc, :body)
@@ -137,7 +138,8 @@ module FeedMe
137
138
  end
138
139
 
139
140
  # unescapes HTML. If xml is true, also converts XML-only named entities to HTML.
140
- def unescapeHTML(str, xml = true)
141
+ def unescapeHTML(str, xml=true)
142
+ return nil if str.nil?
141
143
  CGI.unescapeHTML(xml ? str.gsub("&apos;", "&#39;") : str)
142
144
  end
143
145
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: feedme
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.2
4
+ version: 0.8.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Didion
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-01-14 00:00:00 -05:00
12
+ date: 2010-05-27 00:00:00 -04:00
13
13
  default_executable:
14
14
  dependencies: []
15
15