feedme 0.8.2 → 0.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +10 -1
- data/examples/rocketboom.rb +9 -5
- data/lib/feedme.rb +42 -9
- data/lib/hpricot-util.rb +1 -0
- data/lib/html-cleaner.rb +3 -1
- metadata +2 -2
data/History.txt
CHANGED
@@ -1,4 +1,13 @@
|
|
1
|
-
=== 0.8.
|
1
|
+
=== 0.8.3 / 2010-05-27
|
2
|
+
|
3
|
+
* Attempt to recognize incomplete RSS documents with a simpler regular
|
4
|
+
expression, since the full one causes FeedMe to hang on large documents.
|
5
|
+
* Fix NPE in html-cleaner.
|
6
|
+
* Add concept of value selectors: when calling the singular accessor for a tag
|
7
|
+
that has multiple instances, a value selector chooses which to return. Provide a
|
8
|
+
default value selector for link tags.
|
9
|
+
|
10
|
+
=== 0.8.2 / 2010-01-14
|
2
11
|
|
3
12
|
* Remove VERSION variable from feedme.rb
|
4
13
|
* Don't double-escape CDATA-escaped content
|
data/examples/rocketboom.rb
CHANGED
@@ -14,13 +14,14 @@ def fetch(url)
|
|
14
14
|
end
|
15
15
|
|
16
16
|
# read from a file
|
17
|
-
content = ""
|
18
|
-
File.open('
|
19
|
-
content = file.read
|
20
|
-
end
|
17
|
+
#content = ""
|
18
|
+
#File.open('bad.rss', "r") do |file|
|
19
|
+
# content = file.read
|
20
|
+
#end
|
21
21
|
|
22
22
|
# read from a url
|
23
23
|
#content = fetch('http://www.rocketboom.com/rss/hd.xml')
|
24
|
+
content = fetch('http://failbooking.com/feed/')
|
24
25
|
|
25
26
|
# create a new ParserBuilder
|
26
27
|
builder = FeedMe::ParserBuilder.new
|
@@ -45,7 +46,10 @@ rss.items.each do |item|
|
|
45
46
|
puts "Categories: #{item.category_array.join(', ')}" if item.category_array?
|
46
47
|
# ! causes value to be modified according to prior specifications
|
47
48
|
# ? checks for the presence of a tag/attribute
|
48
|
-
puts "Description:\n#{item.description
|
49
|
+
puts "Description:\n#{item.description}" if item.description?
|
49
50
|
# we can access attribute values just as easily as tag content
|
50
51
|
puts "Enclosure: #{item.enclosure.url}" if item.enclosure?
|
52
|
+
loc = 'media:content'
|
53
|
+
mc = item.call_virtual_method("#{loc}_values".to_sym)
|
54
|
+
puts mc.join(',')
|
51
55
|
end
|
data/lib/feedme.rb
CHANGED
@@ -17,6 +17,9 @@ module FeedMe
|
|
17
17
|
NOKOGIRI_HELPER = 'nokogiri-util.rb'
|
18
18
|
HPRICOT_HELPER = 'hpricot-util.rb'
|
19
19
|
|
20
|
+
# default rels to accept, in order of preference
|
21
|
+
DEFAULT_RELS = [ 'self', 'alternate', 'enclosure', 'related', 'edit', 'replies', 'via' ]
|
22
|
+
|
20
23
|
# Parse a feed using the promiscuous parser.
|
21
24
|
def FeedMe.parse(source, options={})
|
22
25
|
ParserBuilder.new(options).parse(source)
|
@@ -46,6 +49,9 @@ module FeedMe
|
|
46
49
|
attr_accessor :value_tags
|
47
50
|
# Tags to use for element value when specific tag isn't specified
|
48
51
|
attr_accessor :default_value_tags
|
52
|
+
# A hash of functions for selecting the correct value to return when a tag
|
53
|
+
# has multiple values and the singular accessor is called
|
54
|
+
attr_accessor :value_selectors
|
49
55
|
# A hash of attribute/tag name aliases.
|
50
56
|
attr_accessor :aliases
|
51
57
|
# An array of the transformation functions applied when the !
|
@@ -106,6 +112,19 @@ module FeedMe
|
|
106
112
|
}
|
107
113
|
@default_value_tags = [ CONTENT_KEY, :href, :url ]
|
108
114
|
|
115
|
+
# methods for selecting the element to return when the singular accessor
|
116
|
+
# is called on a tag with multiple values
|
117
|
+
@value_selectors = {
|
118
|
+
:link => proc do |links|
|
119
|
+
links = links.sort do |a,b|
|
120
|
+
i1 = DEFAULT_RELS.index(a.rel)
|
121
|
+
i2 = DEFAULT_RELS.index(b.rel)
|
122
|
+
i1.nil? ? (i2.nil? ? 0 : 1) : (i2.nil? ? -1 : i1 <=> i2)
|
123
|
+
end
|
124
|
+
links.first
|
125
|
+
end
|
126
|
+
}
|
127
|
+
|
109
128
|
# tag/attribute aliases
|
110
129
|
@aliases = {
|
111
130
|
:items => :item_array,
|
@@ -209,7 +228,7 @@ module FeedMe
|
|
209
228
|
end
|
210
229
|
end
|
211
230
|
|
212
|
-
#
|
231
|
+
# This class is used to create strict parsers
|
213
232
|
class StrictParserBuilder < ParserBuilder
|
214
233
|
attr_accessor :feed_ext_tags, :item_ext_tags, :rels
|
215
234
|
|
@@ -268,7 +287,7 @@ module FeedMe
|
|
268
287
|
]
|
269
288
|
|
270
289
|
@rels = {
|
271
|
-
:link =>
|
290
|
+
:link => DEFAULT_RELS
|
272
291
|
}
|
273
292
|
|
274
293
|
# extensions
|
@@ -370,7 +389,9 @@ module FeedMe
|
|
370
389
|
# 1. Tag/attribute name: since tags/attributes are stored as arrays,
|
371
390
|
# the instance variable name is the tag/attribute name followed by
|
372
391
|
# '_array'. The tag/attribute name is actually a virtual method that
|
373
|
-
# returns the first element in the array.
|
392
|
+
# returns the first element in the array. If a Proc is passed as the first
|
393
|
+
# argument and the array has more than one element, the Proc is used to sort
|
394
|
+
# the array before returning the first element.
|
374
395
|
# 2. Aliases: for tags/attributes with aliases, the alias is a virtual
|
375
396
|
# method that simply forwards to the aliased method.
|
376
397
|
# 3. Any name that ends with a '?' returns true if the name without
|
@@ -401,7 +422,15 @@ module FeedMe
|
|
401
422
|
result = if key? name
|
402
423
|
self[name]
|
403
424
|
elsif key? array_key
|
404
|
-
self[array_key]
|
425
|
+
array = self[array_key]
|
426
|
+
elt = if array.size > 1
|
427
|
+
if (!args.empty? && args.first.is_a?(Proc))
|
428
|
+
args.first.call(array)
|
429
|
+
elsif (fm_builder.value_sorters.key?(name))
|
430
|
+
value_selectors[name].call(array)
|
431
|
+
end
|
432
|
+
end
|
433
|
+
elt || array.first
|
405
434
|
elsif name_str[-1,1] == '?'
|
406
435
|
!call_virtual_method(name_str[0..-2], args, history).nil? rescue false
|
407
436
|
elsif name_str[-1,1] == '!'
|
@@ -503,9 +532,11 @@ module FeedMe
|
|
503
532
|
|
504
533
|
trans = fm_builder.transformation_fns[t_name] or
|
505
534
|
raise NoMethodError.new("No such transformation #{t_name}", t_name)
|
506
|
-
|
535
|
+
|
507
536
|
if value.is_a? Array
|
508
|
-
value = value.collect
|
537
|
+
value = value.collect do |x|
|
538
|
+
x.nil? ? nil : trans.call(x, *args)
|
539
|
+
end.compact
|
509
540
|
else
|
510
541
|
value = trans.call(value, *args)
|
511
542
|
end
|
@@ -556,8 +587,11 @@ module FeedMe
|
|
556
587
|
private
|
557
588
|
|
558
589
|
def parse
|
559
|
-
# RSS = everything between channel tags + everything between </channel> and
|
560
|
-
if
|
590
|
+
# RSS = everything between channel tags + everything between </channel> and
|
591
|
+
# </rdf> if this is an RDF document. Do a simpler match to begin with
|
592
|
+
# since the more complex regexp will hang on a large and invalid document.
|
593
|
+
if @fm_source =~ %r{<(?:.*?:)?channel.+</(?:.*?:)?channel}mi &&
|
594
|
+
@fm_source =~ %r{<(?:.*?:)?(rss|rdf)(.*?)>.*?<(?:.*?:)?channel(.*?)>(.+)</(?:.*?:)?channel>(.*)</(?:.*?:)?(?:rss|rdf)>}mi
|
561
595
|
@fm_type = $1.upcase.to_s
|
562
596
|
@fm_tags = fm_builder.all_rss_tags
|
563
597
|
attrs = parse_attributes($1, $2 + $3)
|
@@ -655,7 +689,6 @@ module FeedMe
|
|
655
689
|
end
|
656
690
|
|
657
691
|
@fm_unparsed += elements.keys
|
658
|
-
|
659
692
|
@fm_parsed.uniq!
|
660
693
|
@fm_unparsed.uniq!
|
661
694
|
end
|
data/lib/hpricot-util.rb
CHANGED
data/lib/html-cleaner.rb
CHANGED
@@ -58,6 +58,7 @@ module FeedMe
|
|
58
58
|
#
|
59
59
|
# Extra (i.e. unmatched) ending tags and comments are removed.
|
60
60
|
def clean(str)
|
61
|
+
return nil if str.nil?
|
61
62
|
str = unescapeHTML(str)
|
62
63
|
doc = Hpricot(str, :fixup_tags => true)
|
63
64
|
doc = subtree(doc, :body)
|
@@ -137,7 +138,8 @@ module FeedMe
|
|
137
138
|
end
|
138
139
|
|
139
140
|
# unescapes HTML. If xml is true, also converts XML-only named entities to HTML.
|
140
|
-
def unescapeHTML(str, xml
|
141
|
+
def unescapeHTML(str, xml=true)
|
142
|
+
return nil if str.nil?
|
141
143
|
CGI.unescapeHTML(xml ? str.gsub("'", "'") : str)
|
142
144
|
end
|
143
145
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: feedme
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.
|
4
|
+
version: 0.8.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Didion
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-
|
12
|
+
date: 2010-05-27 00:00:00 -04:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|