feedme 0.8.2 → 0.8.3
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +10 -1
- data/examples/rocketboom.rb +9 -5
- data/lib/feedme.rb +42 -9
- data/lib/hpricot-util.rb +1 -0
- data/lib/html-cleaner.rb +3 -1
- metadata +2 -2
data/History.txt
CHANGED
@@ -1,4 +1,13 @@
|
|
1
|
-
=== 0.8.
|
1
|
+
=== 0.8.3 / 2010-05-27
|
2
|
+
|
3
|
+
* Attempt to recognize incomplete RSS documents with a simpler regular
|
4
|
+
expression, since the full one causes FeedMe to hang on large documents.
|
5
|
+
* Fix NPE in html-cleaner.
|
6
|
+
* Add concept of value selectors: when calling the singular accessor for a tag
|
7
|
+
that has multiple instances, a value selector chooses which to return. Provide a
|
8
|
+
default value selector for link tags.
|
9
|
+
|
10
|
+
=== 0.8.2 / 2010-01-14
|
2
11
|
|
3
12
|
* Remove VERSION variable from feedme.rb
|
4
13
|
* Don't double-escape CDATA-escaped content
|
data/examples/rocketboom.rb
CHANGED
@@ -14,13 +14,14 @@ def fetch(url)
|
|
14
14
|
end
|
15
15
|
|
16
16
|
# read from a file
|
17
|
-
content = ""
|
18
|
-
File.open('
|
19
|
-
content = file.read
|
20
|
-
end
|
17
|
+
#content = ""
|
18
|
+
#File.open('bad.rss', "r") do |file|
|
19
|
+
# content = file.read
|
20
|
+
#end
|
21
21
|
|
22
22
|
# read from a url
|
23
23
|
#content = fetch('http://www.rocketboom.com/rss/hd.xml')
|
24
|
+
content = fetch('http://failbooking.com/feed/')
|
24
25
|
|
25
26
|
# create a new ParserBuilder
|
26
27
|
builder = FeedMe::ParserBuilder.new
|
@@ -45,7 +46,10 @@ rss.items.each do |item|
|
|
45
46
|
puts "Categories: #{item.category_array.join(', ')}" if item.category_array?
|
46
47
|
# ! causes value to be modified according to prior specifications
|
47
48
|
# ? checks for the presence of a tag/attribute
|
48
|
-
puts "Description:\n#{item.description
|
49
|
+
puts "Description:\n#{item.description}" if item.description?
|
49
50
|
# we can access attribute values just as easily as tag content
|
50
51
|
puts "Enclosure: #{item.enclosure.url}" if item.enclosure?
|
52
|
+
loc = 'media:content'
|
53
|
+
mc = item.call_virtual_method("#{loc}_values".to_sym)
|
54
|
+
puts mc.join(',')
|
51
55
|
end
|
data/lib/feedme.rb
CHANGED
@@ -17,6 +17,9 @@ module FeedMe
|
|
17
17
|
NOKOGIRI_HELPER = 'nokogiri-util.rb'
|
18
18
|
HPRICOT_HELPER = 'hpricot-util.rb'
|
19
19
|
|
20
|
+
# default rels to accept, in order of preference
|
21
|
+
DEFAULT_RELS = [ 'self', 'alternate', 'enclosure', 'related', 'edit', 'replies', 'via' ]
|
22
|
+
|
20
23
|
# Parse a feed using the promiscuous parser.
|
21
24
|
def FeedMe.parse(source, options={})
|
22
25
|
ParserBuilder.new(options).parse(source)
|
@@ -46,6 +49,9 @@ module FeedMe
|
|
46
49
|
attr_accessor :value_tags
|
47
50
|
# Tags to use for element value when specific tag isn't specified
|
48
51
|
attr_accessor :default_value_tags
|
52
|
+
# A hash of functions for selecting the correct value to return when a tag
|
53
|
+
# has multiple values and the singular accessor is called
|
54
|
+
attr_accessor :value_selectors
|
49
55
|
# A hash of attribute/tag name aliases.
|
50
56
|
attr_accessor :aliases
|
51
57
|
# An array of the transformation functions applied when the !
|
@@ -106,6 +112,19 @@ module FeedMe
|
|
106
112
|
}
|
107
113
|
@default_value_tags = [ CONTENT_KEY, :href, :url ]
|
108
114
|
|
115
|
+
# methods for selecting the element to return when the singular accessor
|
116
|
+
# is called on a tag with multiple values
|
117
|
+
@value_selectors = {
|
118
|
+
:link => proc do |links|
|
119
|
+
links = links.sort do |a,b|
|
120
|
+
i1 = DEFAULT_RELS.index(a.rel)
|
121
|
+
i2 = DEFAULT_RELS.index(b.rel)
|
122
|
+
i1.nil? ? (i2.nil? ? 0 : 1) : (i2.nil? ? -1 : i1 <=> i2)
|
123
|
+
end
|
124
|
+
links.first
|
125
|
+
end
|
126
|
+
}
|
127
|
+
|
109
128
|
# tag/attribute aliases
|
110
129
|
@aliases = {
|
111
130
|
:items => :item_array,
|
@@ -209,7 +228,7 @@ module FeedMe
|
|
209
228
|
end
|
210
229
|
end
|
211
230
|
|
212
|
-
#
|
231
|
+
# This class is used to create strict parsers
|
213
232
|
class StrictParserBuilder < ParserBuilder
|
214
233
|
attr_accessor :feed_ext_tags, :item_ext_tags, :rels
|
215
234
|
|
@@ -268,7 +287,7 @@ module FeedMe
|
|
268
287
|
]
|
269
288
|
|
270
289
|
@rels = {
|
271
|
-
:link =>
|
290
|
+
:link => DEFAULT_RELS
|
272
291
|
}
|
273
292
|
|
274
293
|
# extensions
|
@@ -370,7 +389,9 @@ module FeedMe
|
|
370
389
|
# 1. Tag/attribute name: since tags/attributes are stored as arrays,
|
371
390
|
# the instance variable name is the tag/attribute name followed by
|
372
391
|
# '_array'. The tag/attribute name is actually a virtual method that
|
373
|
-
# returns the first element in the array.
|
392
|
+
# returns the first element in the array. If a Proc is passed as the first
|
393
|
+
# argument and the array has more than one element, the Proc is called with
|
394
|
+
# the array and its (non-nil) result is returned instead of the first element.
|
374
395
|
# 2. Aliases: for tags/attributes with aliases, the alias is a virtual
|
375
396
|
# method that simply forwards to the aliased method.
|
376
397
|
# 3. Any name that ends with a '?' returns true if the name without
|
@@ -401,7 +422,15 @@ module FeedMe
|
|
401
422
|
result = if key? name
|
402
423
|
self[name]
|
403
424
|
elsif key? array_key
|
404
|
-
self[array_key]
|
425
|
+
array = self[array_key]
|
426
|
+
elt = if array.size > 1
|
427
|
+
if (!args.empty? && args.first.is_a?(Proc))
|
428
|
+
args.first.call(array)
|
429
|
+
elsif (fm_builder.value_sorters.key?(name))
|
430
|
+
value_selectors[name].call(array)
|
431
|
+
end
|
432
|
+
end
|
433
|
+
elt || array.first
|
405
434
|
elsif name_str[-1,1] == '?'
|
406
435
|
!call_virtual_method(name_str[0..-2], args, history).nil? rescue false
|
407
436
|
elsif name_str[-1,1] == '!'
|
@@ -503,9 +532,11 @@ module FeedMe
|
|
503
532
|
|
504
533
|
trans = fm_builder.transformation_fns[t_name] or
|
505
534
|
raise NoMethodError.new("No such transformation #{t_name}", t_name)
|
506
|
-
|
535
|
+
|
507
536
|
if value.is_a? Array
|
508
|
-
value = value.collect
|
537
|
+
value = value.collect do |x|
|
538
|
+
x.nil? ? nil : trans.call(x, *args)
|
539
|
+
end.compact
|
509
540
|
else
|
510
541
|
value = trans.call(value, *args)
|
511
542
|
end
|
@@ -556,8 +587,11 @@ module FeedMe
|
|
556
587
|
private
|
557
588
|
|
558
589
|
def parse
|
559
|
-
# RSS = everything between channel tags + everything between </channel> and
|
560
|
-
if
|
590
|
+
# RSS = everything between channel tags + everything between </channel> and
|
591
|
+
# </rdf> if this is an RDF document. Do a simpler match to begin with
|
592
|
+
# since the more complex regexp will hang on a large and invalid document.
|
593
|
+
if @fm_source =~ %r{<(?:.*?:)?channel.+</(?:.*?:)?channel}mi &&
|
594
|
+
@fm_source =~ %r{<(?:.*?:)?(rss|rdf)(.*?)>.*?<(?:.*?:)?channel(.*?)>(.+)</(?:.*?:)?channel>(.*)</(?:.*?:)?(?:rss|rdf)>}mi
|
561
595
|
@fm_type = $1.upcase.to_s
|
562
596
|
@fm_tags = fm_builder.all_rss_tags
|
563
597
|
attrs = parse_attributes($1, $2 + $3)
|
@@ -655,7 +689,6 @@ module FeedMe
|
|
655
689
|
end
|
656
690
|
|
657
691
|
@fm_unparsed += elements.keys
|
658
|
-
|
659
692
|
@fm_parsed.uniq!
|
660
693
|
@fm_unparsed.uniq!
|
661
694
|
end
|
data/lib/hpricot-util.rb
CHANGED
data/lib/html-cleaner.rb
CHANGED
@@ -58,6 +58,7 @@ module FeedMe
|
|
58
58
|
#
|
59
59
|
# Extra (i.e. unmatched) ending tags and comments are removed.
|
60
60
|
def clean(str)
|
61
|
+
return nil if str.nil?
|
61
62
|
str = unescapeHTML(str)
|
62
63
|
doc = Hpricot(str, :fixup_tags => true)
|
63
64
|
doc = subtree(doc, :body)
|
@@ -137,7 +138,8 @@ module FeedMe
|
|
137
138
|
end
|
138
139
|
|
139
140
|
# unescapes HTML. If xml is true, also converts XML-only named entities to HTML.
|
140
|
-
def unescapeHTML(str, xml
|
141
|
+
def unescapeHTML(str, xml=true)
|
142
|
+
return nil if str.nil?
|
141
143
|
CGI.unescapeHTML(xml ? str.gsub("'", "'") : str)
|
142
144
|
end
|
143
145
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: feedme
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.
|
4
|
+
version: 0.8.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Didion
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-
|
12
|
+
date: 2010-05-27 00:00:00 -04:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|