feedme 0.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +100 -0
- data/Manifest.txt +2 -0
- data/README.txt +124 -46
- data/Rakefile +21 -4
- data/examples/rocketboom.rb +5 -4
- data/lib/feedme.rb +399 -191
- data/lib/hpricot-util.rb +82 -0
- data/lib/html-cleaner.rb +188 -0
- data/lib/nokogiri-util.rb +117 -0
- data/lib/util.rb +45 -0
- metadata +12 -16
data/History.txt
CHANGED
@@ -1,3 +1,103 @@
|
|
1
|
+
=== 0.8 / 2009-12-14
|
2
|
+
|
3
|
+
* Add new virtual method _values: returns all values for a given tag.
|
4
|
+
* Transformations with arguments are now specified as an array rather than
|
5
|
+
part of the symbol
|
6
|
+
* Add transform method
|
7
|
+
* Add regexp transform
|
8
|
+
* Add nokogiri support (hpricot is still the default)
|
9
|
+
* Copy/paste and fix feed-normalizer clean_html method, drop feed-normalizer dependency
|
10
|
+
|
11
|
+
=== 0.7.1 / 2009-09-24
|
12
|
+
|
13
|
+
* Fix nil_or_empty? to strip whitespace from strings
|
14
|
+
|
15
|
+
=== 0.7 / 2009-09-24
|
16
|
+
|
17
|
+
* Design decision: all element and attribute names will be stored as lower-case. They may still
|
18
|
+
be accessed using upper case, since keys will be normalized by all accessors.
|
19
|
+
* Design decision: RDF will be dealt with at parse time: elements with rdf:resource attributes will be
|
20
|
+
replaced by the actual, referenced elements. Ordering of the referring elements will be preserved.
|
21
|
+
* Removed the concept of ghost tags.
|
22
|
+
|
23
|
+
=== 0.6.5 / 2009-09-24
|
24
|
+
|
25
|
+
* Fix :truncHtml completely by requiring active_support.
|
26
|
+
|
27
|
+
=== 0.6.4 / 2009-09-23
|
28
|
+
|
29
|
+
* Roll version to make github happy.
|
30
|
+
|
31
|
+
=== 0.6.3 / 2009-09-23
|
32
|
+
|
33
|
+
* Fix truncHtml: use code by Henrik Nyh, which in turn uses Hpricot
|
34
|
+
|
35
|
+
=== 0.6.2 / 2009-09-23
|
36
|
+
|
37
|
+
* Fix content-parsing regular expression to correctly handle closed elements
|
38
|
+
* Reverse earlier design decision: keep namespaces for attributes.
|
39
|
+
|
40
|
+
=== 0.6.1 / 2009-09-23
|
41
|
+
|
42
|
+
* Improve handling of rdf:items. From now on, .items will forward to .item_array. The rdf items can still be accessed by [:items_array] or .items_array.
|
43
|
+
|
44
|
+
=== 0.6 / 2009-09-23
|
45
|
+
|
46
|
+
* Fix handling of the items element (mostly affects RSS 1.0 documents)
|
47
|
+
* Make attribute naming consistent
|
48
|
+
* Design decision: attributes can only ever have a single value, so they will always be stored as scalars
|
49
|
+
rather than arrays. This will also nicely resolve any possible collisions between attribute and tag names.
|
50
|
+
|
51
|
+
=== 0.5.4 / 2009-09-22
|
52
|
+
|
53
|
+
* Minor improvements to to_indented_s
|
54
|
+
* Fix tag names: change all tags with namespaces to the cleaned version (unquote, ':' replaced with '_')
|
55
|
+
* Design decision: all attribute names will have their namespaces stripped; namespaces are generally
|
56
|
+
treated as optional (even if they aren't technically so) and it's annoying to have to check both forms;
|
57
|
+
this decision may be reversed if there are found to be conflicts
|
58
|
+
|
59
|
+
=== 0.5.3 / 2009-09-22
|
60
|
+
|
61
|
+
* Roll version to test GitHub weirdness.
|
62
|
+
|
63
|
+
=== 0.5.2 / 2009-09-22
|
64
|
+
|
65
|
+
* Improve to_s method for prettier array display.
|
66
|
+
|
67
|
+
=== 0.5.1 / 2009-09-21
|
68
|
+
|
69
|
+
* Update example code
|
70
|
+
* Bug fix: call_virtual_method has invalid return if neither a key nor any of its aliases has a value
|
71
|
+
* Subsequent releases will follow standard versioning model of "major.minor.bugfix"
|
72
|
+
|
73
|
+
=== 0.5 / 2009-09-21
|
74
|
+
|
75
|
+
* Special handling for atom id tag
|
76
|
+
* to_indented_str method, which creates a pretty output for a FeedData
|
77
|
+
* Improved to_s method that delegates to to_indented_str
|
78
|
+
|
79
|
+
=== 0.4 / 2009-09-20
|
80
|
+
|
81
|
+
* Expose call_virtual_method as public
|
82
|
+
* Change 'name' argument of call_virtual_method to 'sym'
|
83
|
+
* Add default value for call_virtual_method 'args' argument
|
84
|
+
* Add :'media:content' and :'content:encoded' as ext tags
|
85
|
+
* fix use of FeedNormalizer in :cleanHtml transformation
|
86
|
+
|
87
|
+
=== 0.3 / 2009-09-18
|
88
|
+
|
89
|
+
* Update example code
|
90
|
+
* Bug fix: call_virtual_method always throws exception
|
91
|
+
* Bug fix: responds_to? -> respond_to? and rels -> :rels
|
92
|
+
|
93
|
+
=== 0.2 / 2009-09-12
|
94
|
+
|
95
|
+
* Change bang mods to more flexible transformations framework.
|
96
|
+
* Add additional transformation functions.
|
97
|
+
* Add methods for RSS/Atom emulation that automatically add appropriate aliases.
|
98
|
+
* Add empty_string_for_nil and error_on_missing_key options.
|
99
|
+
* Add support for parsing only certain rels in the strict parser.
|
100
|
+
|
1
101
|
=== 0.1 / 2009-09-03
|
2
102
|
|
3
103
|
* Everything is new. First release.
|
data/Manifest.txt
CHANGED
data/README.txt
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
= feedme
|
2
2
|
|
3
|
-
* http://
|
3
|
+
* http://wiki.github.com/jdidion/feedme
|
4
4
|
|
5
5
|
== DESCRIPTION:
|
6
6
|
|
@@ -24,76 +24,143 @@ The API is similar to SimpleRSS:
|
|
24
24
|
require 'open-uri'
|
25
25
|
|
26
26
|
rss = FeedMe.parse open('http://slashdot.org/index.rdf')
|
27
|
-
|
28
|
-
rss.version # => 1.0
|
27
|
+
rss.version # => 1.0
|
29
28
|
rss.channel.title # => "Slashdot"
|
30
29
|
rss.channel.link # => "http://slashdot.org/"
|
31
30
|
rss.items.first.link # => "http://books.slashdot.org/article.pl?sid=05/08/29/1319236&from=rss"
|
32
31
|
|
33
|
-
But since the parser can read Atom feeds as easily as RSS feeds, there are
|
32
|
+
But since the parser can read Atom feeds as easily as RSS feeds, there are aliases that allow more Atom-like reading:
|
34
33
|
|
35
34
|
rss.feed.title # => "Slashdot"
|
36
35
|
rss.feed.link # => "http://slashdot.org/"
|
37
36
|
rss.entries.first.link # => "http://books.slashdot.org/article.pl?sid=05/08/29/1319236&from=rss"
|
38
|
-
|
39
|
-
Under the covers, all
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
37
|
+
|
38
|
+
Under the covers, all element values are stored in arrays. This means that you can access all content for an element that appears multiple times (e.g. category):
|
39
|
+
|
40
|
+
rss.items.first.category_array # => ["News for Nerds", "Technology"]
|
41
|
+
rss.items.first.category # => "News for Nerds"
|
42
|
+
|
44
43
|
You also have access to all the attributes as well as tag values:
|
45
44
|
|
46
|
-
|
47
|
-
|
45
|
+
rss.items.first.guid.isPermaLink # => "true"
|
46
|
+
rss.items.first.guid.content # => http://books.slashdot.org/article.pl?sid=05/08/29/1319236
|
48
47
|
|
49
48
|
FeedMe also adds some syntactic sugar that makes it easy to get the information you want:
|
50
49
|
|
51
|
-
|
52
|
-
|
53
|
-
|
50
|
+
rss.items.first.category? # => true
|
51
|
+
rss.items.first.category_count # => 2
|
52
|
+
rss.items.first.guid_value # => http://books.slashdot.org/article.pl?sid=05/08/29/1319236
|
54
53
|
|
55
54
|
There are two different parsers that you can use, depending on your needs. The default parser is "promiscuous," meaning that it parses all tags. There is also a strict parser that only parses tags specified in a list. Here is how you create the different types of parsers:
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
55
|
+
|
56
|
+
FeedMe.parse(source) # parse using the default (promiscuous) parser
|
57
|
+
FeedMe::ParserBuilder.new.parse(source) # equivalent to the previous line
|
58
|
+
FeedMe.parse_strict(source)
|
59
|
+
FeedMe::StrictParserBuilder.new.parse(source) # only parse certain tags
|
60
|
+
|
61
|
+
The FeedMe class methods and the parser builder constructors also accept an options hash. Options are also passed on to the Parser constructor. Currently, only two options are available:
|
62
|
+
|
63
|
+
1. :empty_string_for_nil => false # return the empty string instead of a nil value
|
64
|
+
2. :error_on_missing_key => false # raise an error if a specified key or virtual method does not exist (otherwise nil is returned)
|
60
65
|
|
61
66
|
The strict parser can be extended by adding new tags to parse:
|
62
67
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
+
builder = FeedMe::StrictParserBuilder.new
|
69
|
+
builder.rss_tags << :some_new_tag
|
70
|
+
builder.rss_item_tags << :'item+myrel' # parse an item that has a custom rel type
|
71
|
+
builder.item_ext_tags << :feedburner_origLink # parse an extension tag - one that has a specific
|
72
|
+
# namespace (use '_', not ':', to separate namespace
|
73
|
+
# from attribute name)
|
74
|
+
|
68
75
|
Either parser can be extended by adding aliases to existing tags:
|
69
76
|
|
70
|
-
|
77
|
+
builder.aliases[:updated] = :pubDate # now you can always access the updated date using :updated,
|
78
|
+
# regardless of whether it's an RSS or Atom feed
|
79
|
+
|
80
|
+
If you don't know ahead of time what type of feed you'll be parsing, you can tell FeedMe to always emulate RSS or Atom. These methods just add a bunch of aliases:
|
81
|
+
|
82
|
+
builder.emulate_rss!
|
83
|
+
builder.emulate_atom!
|
84
|
+
|
85
|
+
Transformations are another bit of syntactic sugar. These are modifications that can be applied to feed content. There is a default transformation that can be applied by adding '!' to the tag name.
|
86
|
+
|
87
|
+
rss.entry.content # => <div>Some great stuff</div>
|
88
|
+
rss.entry.content! # => Some great stuff
|
89
|
+
|
90
|
+
The default transformation can be changed:
|
91
|
+
|
92
|
+
builder.default_transformation = [ :cleanHtml ]
|
93
|
+
|
94
|
+
Custom transformations are defined by mapping one or more transformation functions to a suffix:
|
95
|
+
|
96
|
+
builder.transformations['clean'] = [ :cleanHtml ]
|
97
|
+
|
98
|
+
rss.entry.content # => <div>This is a bunch of text</div><p></p></html>
|
99
|
+
rss.entry.content_clean # => <div>This is a bunch of text</div>
|
100
|
+
|
101
|
+
You can also/instead apply an arbitrary set of transformations via the transform method:
|
71
102
|
|
72
|
-
|
103
|
+
rss.entry.transform(:content, [ :clean, [ :trunc, 50 ] ])
|
73
104
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
105
|
+
You can create your own transformation function. The following is an example of a transformation function that takes an argument. Note that transformation function names may only contain alphanumeric characters. Argument values are specified at the end separated by underscores.
|
106
|
+
|
107
|
+
builder.transformation_fns[:wrap] = proc {|str, col|
|
108
|
+
str.gsub(/(.{1,#{col}})( +|$\n?)|(.{1,#{col}})/, "\\1\\3\n").strip
|
109
|
+
}
|
110
|
+
builder.transformations['wrap'] = [ :wrap_10 ]
|
111
|
+
|
112
|
+
rss.entry.content = This is a bunch of text
|
113
|
+
rss.entry.content_wrap = This is a
|
114
|
+
bunch of
|
115
|
+
text
|
78
116
|
|
79
|
-
|
80
|
-
builder.bang_mod_fns[:wrap] => proc {|str, col| str.gsub(/(.{1,#{col}})( +|$\n?)|(.{1,#{col}})/, "\\1\\3\n").strip }
|
81
|
-
builder.bang_mods << :wrap_80
|
117
|
+
The transformation functions available by default are:
|
82
118
|
|
119
|
+
1. :stripHtml - described above
|
120
|
+
2. :cleanHtml - ** Requires FeedNormalizer (which in turn requires Hpricot) **
|
121
|
+
|
122
|
+
rss.entry_array[0].content # => 1 > 2
|
123
|
+
rss.entry_array[0].content! # => 1 > 2
|
124
|
+
|
125
|
+
rss.entry_array[1].content # => <div>Some great stuff</div><p></p></html>
|
126
|
+
rss.entry_array[1].content! # => <div>Some great stuff</div>
|
127
|
+
|
128
|
+
3. :wrap - takes number of columns as a parameter. Respects word boundaries. Example of :wrap_10:
|
129
|
+
|
130
|
+
rss.entry.content # => This is a bunch of text
|
131
|
+
rss.entry.content! # => This is a
|
132
|
+
bunch of
|
133
|
+
text
|
134
|
+
|
135
|
+
4. :trunc - truncates text to a certain length. Example of :trunc_10:
|
136
|
+
|
137
|
+
rss.entries.first.content # => This is a long long long sentence
|
138
|
+
rss.entries.first.content! # => This is a
|
139
|
+
|
140
|
+
5. :truncHtml - truncates the content inside the first set of HTML tags, but preserves the tags. ** Requires ActiveSupport and Hpricot ** Example of :truncHtml_10:
|
141
|
+
|
142
|
+
rss.entries.first.content # => <div>This is a long long long sentence</div></html>
|
143
|
+
rss.entries.first.content! # => <div>This is a </div></html>
|
144
|
+
|
145
|
+
6. :regexp - apply a regular expression and extract the capture groups
|
146
|
+
|
147
|
+
rss.entries.first.content # => This is a long long long entry
|
148
|
+
rss.entries.first.transform(:content, [ :regexp, /(This is a long ).*(entry)/ ]) # => This is a long entry
|
149
|
+
|
83
150
|
In order to prevent clashes between tag/attribute names and the parser class' instance variables, all instance variables are prefixed with 'fm_'. They are:
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
151
|
+
|
152
|
+
fm_source # the original, unparsed source
|
153
|
+
fm_options # the options passed to the parser constructor
|
154
|
+
fm_type # the feed type
|
155
|
+
fm_tags # the tags the parser looks for in the source
|
156
|
+
fm_parsed # the list of tags the parser actually found
|
157
|
+
fm_unparsed # the list of tags that appeared in the feed but were not parsed (useful for debugging)
|
91
158
|
|
92
159
|
Additionally, there are several variables that are available at every level of the parse tree:
|
93
160
|
|
94
|
-
|
95
|
-
|
96
|
-
|
161
|
+
fm_builder # the ParserBuilder that created the parser
|
162
|
+
fm_parent # the container of the current level of the parse tree
|
163
|
+
fm_tag_name # the name of the rss/atom tag whose content is contained in this level of the tree
|
97
164
|
|
98
165
|
=== A word on RSS/Atom Versions
|
99
166
|
|
@@ -107,9 +174,20 @@ Due to various incompatibilities between different RSS versions, it is strongly
|
|
107
174
|
|
108
175
|
== INSTALL:
|
109
176
|
|
110
|
-
* gem install feedme
|
111
|
-
* http://
|
177
|
+
* gem install jdidion-feedme (Add GitHub as a gem source: gem sources -a http://gems.github.com)
|
178
|
+
* http://github.com/jdidion/feedme/downloads
|
179
|
+
|
180
|
+
To use certain features of FeedMe, some dependencies are required:
|
181
|
+
* To use the :truncHtml transformation for truncating HTML content, ActiveSupport and Hpricot are required
|
182
|
+
|
183
|
+
sudo gem install activesupport
|
184
|
+
sudo gem install hpricot
|
185
|
+
|
186
|
+
* To use the :cleanHtml for sanitizing HTML, FeedNormalizer and Hpricot are required
|
187
|
+
|
188
|
+
sudo gem install feed-normalizer
|
189
|
+
sudo gem install hpricot
|
112
190
|
|
113
191
|
== LICENSE:
|
114
192
|
|
115
|
-
This work is licensed under the Creative Commons Attribution 3.0 United States License. To view a copy of this license, visit http://creativecommons.org/licenses/by/3.0/us/ or send a letter to Creative Commons, 171 Second Street, Suite 300, San Francisco, California, 94105, USA.
|
193
|
+
This work is licensed under the Creative Commons Attribution 3.0 United States License. To view a copy of this license, visit http://creativecommons.org/licenses/by/3.0/us/ or send a letter to Creative Commons, 171 Second Street, Suite 300, San Francisco, California, 94105, USA.
|
data/Rakefile
CHANGED
@@ -1,7 +1,24 @@
|
|
1
1
|
require 'rubygems'
|
2
|
-
require '
|
2
|
+
require 'jeweler'
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
tasks = Jeweler::Tasks.new do |s|
|
5
|
+
s.name = "feedme"
|
6
|
+
s.authors = ["John Didion"]
|
7
|
+
s.description = %q{A simple, flexible, and extensible RSS and Atom parser for Ruby. Based on the popular SimpleRSS library, but with many nice extra features.}
|
8
|
+
s.email = ["code@didion.net"]
|
9
|
+
s.extra_rdoc_files = ["History.txt", "Manifest.txt", "README.txt"]
|
10
|
+
s.files = ["History.txt", "Manifest.txt", "README.txt", "Rakefile",
|
11
|
+
"lib/feedme.rb", "lib/hpricot-util.rb", "lib/nokogiri-util.rb",
|
12
|
+
"lib/html-cleaner.rb", "lib/util.rb", "examples/rocketboom.rb",
|
13
|
+
"examples/rocketboom.rss", "test/test_helper.rb"]
|
14
|
+
s.homepage = %q{http://wiki.github.com/jdidion/feedme}
|
15
|
+
s.rdoc_options = ["--main", "README.txt"]
|
16
|
+
s.require_paths = ["lib"]
|
17
|
+
s.rubyforge_project = %q{feedme}
|
18
|
+
s.summary = %q{A simple, flexible, and extensible RSS and Atom parser for Ruby}
|
19
|
+
s.test_files = ["test/test_helper.rb"]
|
7
20
|
end
|
21
|
+
tasks.jeweler.remote = 'github'
|
22
|
+
Jeweler::GemcutterTasks.new
|
23
|
+
|
24
|
+
|
data/examples/rocketboom.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
|
-
|
2
|
-
require '
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'feedme'
|
3
4
|
require 'net/http'
|
4
5
|
|
5
6
|
def fetch(url)
|
@@ -24,13 +25,13 @@ end
|
|
24
25
|
# create a new ParserBuilder
|
25
26
|
builder = FeedMe::ParserBuilder.new
|
26
27
|
# add a default transformation to wrap content to 80 columns
|
27
|
-
builder.
|
28
|
+
builder.default_transformation << :wrap_80
|
28
29
|
|
29
30
|
# parse the rss feed
|
30
31
|
rss = builder.parse(content)
|
31
32
|
|
32
33
|
# equivalent to rss.channel.title
|
33
|
-
puts "#{rss.
|
34
|
+
puts "#{rss.class} Feed: #{rss.title}"
|
34
35
|
|
35
36
|
# use a virtual method...this one a shortcut to rss.items.size
|
36
37
|
puts "#{rss.item_count} items"
|
data/lib/feedme.rb
CHANGED
@@ -1,54 +1,84 @@
|
|
1
|
-
####################################################################################
|
2
|
-
# FeedMe v0.1
|
3
|
-
#
|
4
|
-
# FeedMe is an easy to use parser for RSS and Atom files. It is based on SimpleRSS,
|
5
|
-
# but has some improvements that make it worth considering:
|
6
|
-
# 1. Support for attributes
|
7
|
-
# 2. Support for nested elements
|
8
|
-
# 3. Support for elements that appear multiple times
|
9
|
-
# 4. Syntactic sugar that makes it easier to get at the information you want
|
10
|
-
#
|
11
|
-
# One word of caution: FeedMe will be maintained only so long as SimpleRSS does not
|
12
|
-
# provide the above features. I will try to keep FeedMe's API compatible with
|
13
|
-
# SimpleRSS so that it will be easy for users to switch if/when necessary.
|
14
|
-
####################################################################################
|
15
|
-
|
16
1
|
require 'cgi'
|
17
2
|
require 'time'
|
3
|
+
require 'util.rb'
|
18
4
|
|
19
5
|
module FeedMe
|
20
|
-
|
6
|
+
# The current version of FeedMe.
|
7
|
+
VERSION = "0.7.2"
|
21
8
|
|
22
|
-
#
|
9
|
+
# The value of Parser#fm_type for RSS feeds.
|
23
10
|
RSS = :RSS
|
11
|
+
# The value of Parser#fm_type for RDF (RSS 1.0) feeds.
|
12
|
+
RDF = :RDF
|
13
|
+
# The value of Parser#fm_type for Atom feeds.
|
24
14
|
ATOM = :ATOM
|
25
15
|
|
26
|
-
#
|
16
|
+
# The key used to access the content element of a mixed tag.
|
27
17
|
CONTENT_KEY = :content
|
28
18
|
|
19
|
+
# Helper libraries for HTML functions
|
20
|
+
NOKOGIRI_HELPER = 'nokogiri-util.rb'
|
21
|
+
HPRICOT_HELPER = 'hpricot-util.rb'
|
22
|
+
|
23
|
+
# Parse a feed using the promiscuous parser.
|
29
24
|
def FeedMe.parse(source, options={})
|
30
|
-
ParserBuilder.new.parse(source
|
25
|
+
ParserBuilder.new(options).parse(source)
|
31
26
|
end
|
32
27
|
|
28
|
+
# Parse a feed using the strict parser.
|
33
29
|
def FeedMe.parse_strict(source, options={})
|
34
|
-
StrictParserBuilder.new.parse(source
|
30
|
+
StrictParserBuilder.new(options).parse(source)
|
35
31
|
end
|
36
32
|
|
33
|
+
# This class is used to create promiscuous parsers.
|
37
34
|
class ParserBuilder
|
38
|
-
|
39
|
-
|
40
|
-
|
35
|
+
# The options passed to this ParserBuilder's constructor.
|
36
|
+
attr_reader :options
|
37
|
+
# The tags that are parsed for RSS feeds.
|
38
|
+
attr_accessor :rss_tags
|
39
|
+
# The subtags of item elements that are parsed for RSS feeds.
|
40
|
+
attr_accessor :rss_item_tags
|
41
|
+
# The tags that are parsed for Atom feeds.
|
42
|
+
attr_accessor :atom_tags
|
43
|
+
# The subtags of entry elements that are parsed for Atom feeds.
|
44
|
+
attr_accessor :atom_entry_tags
|
45
|
+
# The names of tags that should be parsed as date values.
|
46
|
+
attr_accessor :date_tags
|
47
|
+
# An array of names of attributes/subtags whose values can be
|
48
|
+
# used as the default value of a mixed element.
|
49
|
+
attr_accessor :value_tags
|
50
|
+
# Tags to use for element value when specific tag isn't specified
|
51
|
+
attr_accessor :default_value_tags
|
52
|
+
# A hash of attribute/tag name aliases.
|
53
|
+
attr_accessor :aliases
|
54
|
+
# An array of the transformation functions applied when the !
|
55
|
+
# suffix is added to the attribute/tag name.
|
56
|
+
attr_accessor :default_transformation
|
57
|
+
# Mapping of transformation names to functions. Each key is a
|
58
|
+
# suffix that can be appended to an attribute/tag name, and
|
59
|
+
# the value is an array of transformation function names that
|
60
|
+
# are applied when that transformation is used.
|
61
|
+
attr_accessor :transformations
|
62
|
+
# Mapping of transformation function names to Procs.
|
63
|
+
attr_accessor :transformation_fns
|
64
|
+
# the helper library used for HTML transformations
|
65
|
+
attr_accessor :html_helper_lib
|
41
66
|
|
42
|
-
#
|
43
|
-
|
67
|
+
# Create a new ParserBuilder. Allowed options are:
|
68
|
+
# * :empty_string_for_nil => false # return the empty string instead of a nil value
|
69
|
+
# * :error_on_missing_key => false # raise an error if a specified key or virtual
|
70
|
+
# method does not exist (otherwise nil is returned)
|
71
|
+
def initialize(options={})
|
72
|
+
@options = options
|
73
|
+
|
44
74
|
# rss tags
|
45
75
|
@rss_tags = [
|
46
76
|
{
|
47
77
|
:image => nil,
|
48
|
-
:
|
49
|
-
:
|
50
|
-
:
|
51
|
-
:items => [{ :
|
78
|
+
:textinput => nil,
|
79
|
+
:skiphours => nil,
|
80
|
+
:skipdays => nil,
|
81
|
+
:items => [{ :rdf_seq => nil }],
|
52
82
|
#:item => @rss_item_tags
|
53
83
|
}
|
54
84
|
]
|
@@ -70,14 +100,15 @@ module FeedMe
|
|
70
100
|
]
|
71
101
|
|
72
102
|
# tags whose value is a date
|
73
|
-
@date_tags = [ :
|
103
|
+
@date_tags = [ :pubdate, :lastbuilddate, :published, :updated, :dc_date,
|
104
|
+
:expirationdate ]
|
74
105
|
|
75
|
-
# tags that can be used as the default value for a
|
76
|
-
@value_tags =
|
106
|
+
# tags that can be used as the default value for a mixed element
|
107
|
+
@value_tags = {
|
108
|
+
:media_content => :url
|
109
|
+
}
|
110
|
+
@default_value_tags = [ CONTENT_KEY, :href, :url ]
|
77
111
|
|
78
|
-
# tags that don't become part of the parsed object tree
|
79
|
-
@ghost_tags = [ :'rdf:Seq' ]
|
80
|
-
|
81
112
|
# tag/attribute aliases
|
82
113
|
@aliases = {
|
83
114
|
:items => :item_array,
|
@@ -87,64 +118,130 @@ module FeedMe
|
|
87
118
|
:link => :'link+self'
|
88
119
|
}
|
89
120
|
|
90
|
-
#
|
91
|
-
@
|
92
|
-
@
|
93
|
-
|
94
|
-
|
121
|
+
# transformations
|
122
|
+
@html_helper_lib = HPRICOT_HELPER
|
123
|
+
@default_transformation = [ :cleanHtml ]
|
124
|
+
@transformations = {}
|
125
|
+
@transformation_fns = {
|
126
|
+
# remove all HTML tags
|
127
|
+
:stripHtml => proc do |str|
|
128
|
+
require @html_helper_lib
|
129
|
+
FeedMe.html_helper.strip_html(str)
|
130
|
+
end,
|
131
|
+
|
132
|
+
# clean HTML content using FeedNormalizer's HtmlCleaner class
|
133
|
+
:cleanHtml => proc do |str|
|
134
|
+
require @html_helper_lib
|
135
|
+
FeedMe.html_helper.clean_html(str)
|
136
|
+
end,
|
137
|
+
|
138
|
+
# wrap text at a certain number of characters (respecting word boundaries)
|
139
|
+
:wrap => proc do |str, col|
|
140
|
+
str.gsub(/(.{1,#{col}})( +|$\n?)|(.{1,#{col}})/, "\\1\\3\n").strip
|
141
|
+
end,
|
142
|
+
|
143
|
+
# truncate text, respecting word boundaries
|
144
|
+
:trunc => proc {|str, wordcount| str.trunc(wordcount.to_i) },
|
145
|
+
|
146
|
+
# truncate HTML and leave enclosing HTML tags
|
147
|
+
:truncHtml => proc do |str, wordcount|
|
148
|
+
require @html_helper_lib
|
149
|
+
FeedMe.html_helper.truncate_html(str, wordcount.to_i)
|
150
|
+
end,
|
151
|
+
|
152
|
+
:regexp => proc do |str, regexp|
|
153
|
+
match = Regexp.new(regexp).match(str)
|
154
|
+
match.nil? ? nil : match[1]
|
155
|
+
end,
|
95
156
|
}
|
96
157
|
end
|
97
158
|
|
159
|
+
# Prepare tag list for an RSS feed.
|
98
160
|
def all_rss_tags
|
99
161
|
all_tags = rss_tags.dup
|
100
162
|
all_tags[0][:item] = rss_item_tags.dup
|
101
163
|
return all_tags
|
102
164
|
end
|
103
165
|
|
166
|
+
# Prepare tag list for an Atom feed.
|
104
167
|
def all_atom_tags
|
105
168
|
all_tags = atom_tags.dup
|
106
169
|
all_tags[0][:entry] = atom_entry_tags.dup
|
107
170
|
return all_tags
|
108
171
|
end
|
109
172
|
|
110
|
-
|
173
|
+
# Add aliases so that Atom feed elements can be accessed
|
174
|
+
# using the names of their RSS counterparts.
|
175
|
+
def emulate_rss!
|
176
|
+
aliases.merge!({
|
177
|
+
:guid => :id, # this alias never actually gets used; see FeedData#id
|
178
|
+
:copyright => :rights,
|
179
|
+
:pubdate => [ :published, :updated ],
|
180
|
+
:lastbuilddate => [ :updated, :published ],
|
181
|
+
:description => [ :content, :summary ],
|
182
|
+
:managingeditor => [ :'author/name', :'contributor/name' ],
|
183
|
+
:webmaster => [ :'author/name', :'contributor/name' ],
|
184
|
+
:image => [ :icon, :logo ]
|
185
|
+
})
|
186
|
+
end
|
187
|
+
|
188
|
+
# Add aliases so that RSS feed elements can be accessed
|
189
|
+
# using the names of their Atom counterparts.
|
190
|
+
def emulate_atom!
|
191
|
+
aliases.merge!({
|
192
|
+
:rights => :copyright,
|
193
|
+
:content => :description,
|
194
|
+
:contributor => :author,
|
195
|
+
:id => [ :guid_value, :link ],
|
196
|
+
:author => [ :managingeditor, :webmaster ],
|
197
|
+
:updated => [ :lastbuilddate, :pubdate ],
|
198
|
+
:published => [ :pubDate, :lastbuilddate ],
|
199
|
+
:icon => :'image/url',
|
200
|
+
:logo => :'image/url',
|
201
|
+
:summary => :'description_trunc'
|
202
|
+
})
|
203
|
+
end
|
204
|
+
|
205
|
+
# Parse +source+ using a +Parser+ created from this +ParserBuilder+.
|
206
|
+
def parse(source)
|
111
207
|
Parser.new(self, source, options)
|
112
208
|
end
|
113
209
|
end
|
114
210
|
|
211
|
+
#
|
115
212
|
class StrictParserBuilder < ParserBuilder
|
116
|
-
attr_accessor :feed_ext_tags, :item_ext_tags
|
213
|
+
attr_accessor :feed_ext_tags, :item_ext_tags, :rels
|
117
214
|
|
118
|
-
def initialize
|
119
|
-
super()
|
215
|
+
def initialize(options={})
|
216
|
+
super(options)
|
120
217
|
|
121
218
|
# rss tags
|
122
219
|
@rss_tags = [
|
123
220
|
{
|
124
221
|
:image => [ :url, :title, :link, :width, :height, :description ],
|
125
|
-
:
|
126
|
-
:
|
127
|
-
:
|
222
|
+
:textinput => [ :title, :description, :name, :link ],
|
223
|
+
:skiphours => [ :hour ],
|
224
|
+
:skipdays => [ :day ],
|
128
225
|
:items => [
|
129
226
|
{
|
130
|
-
:
|
227
|
+
:rdf_seq => [ :rdf_li ]
|
131
228
|
},
|
132
|
-
:
|
229
|
+
:rdf_seq
|
133
230
|
],
|
134
231
|
#:item => @item_tags
|
135
232
|
},
|
136
233
|
:title, :link, :description, # required
|
137
|
-
:language, :copyright, :
|
138
|
-
:
|
234
|
+
:language, :copyright, :managingeditor, :webmaster, # optional
|
235
|
+
:pubdate, :lastbuilddate, :category, :generator,
|
139
236
|
:docs, :cloud, :ttl, :rating,
|
140
|
-
:image, :
|
237
|
+
:image, :textinput, :skiphours, :skipdays, :item, # have subtags
|
141
238
|
:items
|
142
239
|
]
|
143
240
|
@rss_item_tags = [
|
144
241
|
{},
|
145
242
|
:title, :description, # required
|
146
243
|
:link, :author, :category, :comments, :enclosure, # optional
|
147
|
-
:guid, :
|
244
|
+
:guid, :pubdate, :source, :expirationdate
|
148
245
|
]
|
149
246
|
|
150
247
|
#atom tags
|
@@ -157,9 +254,7 @@ module FeedMe
|
|
157
254
|
},
|
158
255
|
:id, :author, :title, :updated, # required
|
159
256
|
:category, :contributor, :generator, :icon, :logo, # optional
|
160
|
-
:
|
161
|
-
:'link+replies', :'link+related', :'link+enclosure',
|
162
|
-
:'link+via', :rights, :subtitle
|
257
|
+
:link, :rights, :subtitle
|
163
258
|
]
|
164
259
|
@atom_entry_tags = [
|
165
260
|
{
|
@@ -167,22 +262,25 @@ module FeedMe
|
|
167
262
|
:contributor => person_tags
|
168
263
|
},
|
169
264
|
:id, :author, :title, :updated, :summary, # required
|
170
|
-
:category, :content, :contributor, :
|
171
|
-
:
|
172
|
-
:'link+related', :'link+enclosure', :published,
|
173
|
-
:rights, :source
|
265
|
+
:category, :content, :contributor, :link,
|
266
|
+
:published, :rights, :source
|
174
267
|
]
|
175
268
|
|
269
|
+
@rels = {
|
270
|
+
:link => [ 'self', 'alternate', 'edit', 'replies', 'related', 'enclosure', 'via' ]
|
271
|
+
}
|
272
|
+
|
176
273
|
# extensions
|
177
274
|
@feed_ext_tags = [
|
178
|
-
:
|
179
|
-
:
|
275
|
+
:dc_date, :feedburner_browserfriendly,
|
276
|
+
:itunes_author, :itunes_category
|
180
277
|
]
|
181
278
|
@item_ext_tags = [
|
182
|
-
:
|
183
|
-
:
|
184
|
-
:
|
185
|
-
:
|
279
|
+
:dc_date, :dc_subject, :dc_creator,
|
280
|
+
:dc_title, :dc_rights, :dc_publisher,
|
281
|
+
:trackback_ping, :trackback_about,
|
282
|
+
:feedburner_origlink, :media_content,
|
283
|
+
:content_encoded
|
186
284
|
]
|
187
285
|
end
|
188
286
|
|
@@ -202,46 +300,69 @@ module FeedMe
|
|
202
300
|
class FeedData
|
203
301
|
attr_reader :fm_tag_name, :fm_parent, :fm_builder
|
204
302
|
|
205
|
-
def initialize(tag_name, parent, builder
|
303
|
+
def initialize(tag_name, parent, builder)
|
206
304
|
@fm_tag_name = tag_name
|
207
305
|
@fm_parent = parent
|
208
306
|
@fm_builder = builder
|
209
|
-
@data =
|
307
|
+
@data = {}
|
210
308
|
end
|
211
309
|
|
212
310
|
def key?(key)
|
213
|
-
@data.key?(key)
|
311
|
+
@data.key?(clean_tag(key))
|
214
312
|
end
|
215
313
|
|
216
314
|
def keys
|
217
315
|
@data.keys
|
218
316
|
end
|
219
317
|
|
318
|
+
def delete(key)
|
319
|
+
@data.delete(clean_tag(key))
|
320
|
+
end
|
321
|
+
|
322
|
+
def each
|
323
|
+
@data.each {|key, value| yield(key, value) }
|
324
|
+
end
|
325
|
+
|
326
|
+
def each_with_index
|
327
|
+
@data.each_with_index {|key, value, index| yield(key, value, index) }
|
328
|
+
end
|
329
|
+
|
330
|
+
def size
|
331
|
+
@data.size
|
332
|
+
end
|
333
|
+
|
220
334
|
def [](key)
|
221
|
-
@data[key]
|
335
|
+
@data[clean_tag(key)]
|
222
336
|
end
|
223
337
|
|
224
338
|
def []=(key, value)
|
225
|
-
@data[key] = value
|
339
|
+
@data[clean_tag(key)] = value
|
340
|
+
end
|
341
|
+
|
342
|
+
# special handling for atom id tags, due to conflict with
|
343
|
+
# ruby's Object#id method
|
344
|
+
def id
|
345
|
+
key?(:id) ? self[:id] : call_virtual_method(:id)
|
226
346
|
end
|
227
347
|
|
228
348
|
def to_s
|
229
|
-
|
349
|
+
to_indented_s
|
230
350
|
end
|
231
351
|
|
232
|
-
def
|
233
|
-
|
352
|
+
def to_indented_s(indent_step=2)
|
353
|
+
FeedMe.pretty_to_s(self, indent_step, 0, Proc.new do |key, value|
|
354
|
+
(value.is_a?(Array) && value.size == 1) ? [unarrayize(key), value.first] : [key, value]
|
355
|
+
end)
|
234
356
|
end
|
235
357
|
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
return key + '_array'
|
358
|
+
def method_missing(name, *args)
|
359
|
+
result = begin
|
360
|
+
call_virtual_method(name, args)
|
361
|
+
rescue NameError
|
362
|
+
raise if fm_builder.options[:error_on_missing_key]
|
363
|
+
end
|
364
|
+
result = '' if result.nil? and fm_builder.options[:empty_string_for_nil]
|
365
|
+
result
|
245
366
|
end
|
246
367
|
|
247
368
|
# There are several virtual methods for each attribute/tag.
|
@@ -263,70 +384,146 @@ module FeedMe
|
|
263
384
|
# array.size.
|
264
385
|
# 7. If the tag name is of the form "tag+rel", the tag having the
|
265
386
|
# specified rel value is returned
|
266
|
-
def call_virtual_method(
|
387
|
+
def call_virtual_method(sym, args=[], history=[])
|
267
388
|
# make sure we don't get stuck in an infinite loop
|
268
389
|
history.each do |call|
|
269
|
-
if call[0] == fm_tag_name and call[1] ==
|
270
|
-
|
271
|
-
puts self.inspect
|
272
|
-
raise FeedMe::InfiniteCallLoopError.new(name, history)
|
390
|
+
if call[0] == fm_tag_name and call[1] == sym
|
391
|
+
raise FeedMe::InfiniteCallLoopError.new(sym, history)
|
273
392
|
end
|
274
393
|
end
|
275
|
-
history << [ fm_tag_name,
|
394
|
+
history << [ fm_tag_name, sym ]
|
276
395
|
|
277
|
-
|
278
|
-
name = clean_tag(name)
|
396
|
+
name = clean_tag(sym)
|
279
397
|
name_str = name.to_s
|
280
|
-
array_key =
|
281
|
-
|
282
|
-
|
398
|
+
array_key = arrayize(name.to_s)
|
399
|
+
|
400
|
+
result = if key? name
|
401
|
+
self[name]
|
402
|
+
elsif key? array_key
|
403
|
+
self[array_key].first
|
404
|
+
elsif name_str[-1,1] == '?'
|
283
405
|
!call_virtual_method(name_str[0..-2], args, history).nil? rescue false
|
284
406
|
elsif name_str[-1,1] == '!'
|
285
407
|
value = call_virtual_method(name_str[0..-2], args, history)
|
286
|
-
fm_builder.
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
value = fm_builder.bang_mod_fns[bm_key].call(value, *parts[1..-1])
|
408
|
+
_transform(fm_builder.default_transformation, value)
|
409
|
+
elsif name_str =~ /(.+)_values/
|
410
|
+
call_virtual_method(arrayize($1), args, history).collect do |value|
|
411
|
+
_resolve_value value
|
291
412
|
end
|
292
|
-
return value
|
293
|
-
elsif key? name
|
294
|
-
self[name]
|
295
|
-
elsif key? array_key
|
296
|
-
self[array_key].first
|
297
413
|
elsif name_str =~ /(.+)_value/
|
414
|
+
_resolve_value call_virtual_method($1, args, history)
|
415
|
+
elsif name_str =~ /(.+)_count/
|
416
|
+
call_virtual_method(arrayize($1), args, history).size
|
417
|
+
elsif name_str =~ /(.+)_(.+)/ && fm_builder.transformations.key?($2)
|
298
418
|
value = call_virtual_method($1, args, history)
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
419
|
+
_transform(fm_builder.transformations[$2], value)
|
420
|
+
elsif name_str.include?('/') # this is only intended to be used internally
|
421
|
+
value = self
|
422
|
+
name_str.split('/').each do |p|
|
423
|
+
parts = p.split('_')
|
424
|
+
name = clean_tag(parts[0])
|
425
|
+
new_args = parts.size > 1 ? parts[1..-1] : args
|
426
|
+
value = (value.method(name).call(*new_args) rescue
|
427
|
+
value.call_virtual_method(name, new_args, history)) rescue nil
|
428
|
+
break if value.nil?
|
305
429
|
end
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
call_virtual_method(
|
430
|
+
value
|
431
|
+
elsif name_str.include?('+')
|
432
|
+
name_data = name_str.split('+')
|
433
|
+
rel = name_data[1]
|
434
|
+
value = nil
|
435
|
+
call_virtual_method(arrayize(name_data[0]), args, history).each do |elt|
|
312
436
|
next unless elt.is_a?(FeedData) and elt.rel?
|
313
|
-
|
437
|
+
value = elt if elt.rel.casecmp(rel) == 0
|
438
|
+
break unless value.nil?
|
314
439
|
end
|
440
|
+
value
|
315
441
|
elsif fm_builder.aliases.key? name
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
if item[:'rdf:about'] == uri
|
324
|
-
return item.call_virtual_method(name, args, history)
|
325
|
-
end
|
442
|
+
names = fm_builder.aliases[name]
|
443
|
+
names = [names] unless names.is_a? Array
|
444
|
+
value = nil
|
445
|
+
names.each do |name|
|
446
|
+
value = (method(name).call(*args) rescue
|
447
|
+
call_virtual_method(name, args, history)) rescue next
|
448
|
+
break unless value.nil?
|
326
449
|
end
|
450
|
+
value
|
327
451
|
else
|
328
|
-
|
452
|
+
nil
|
453
|
+
end
|
454
|
+
|
455
|
+
raise NameError.new("No such method '#{name}'", name) if result.nil?
|
456
|
+
|
457
|
+
result
|
458
|
+
end
|
459
|
+
|
460
|
+
# Apply transformations to a tag value. Can either accept a transformation
|
461
|
+
# name or an array of transformation function names.
|
462
|
+
def transform(tag, trans)
|
463
|
+
value = call_virtual_method(tag) or return nil
|
464
|
+
transformations = trans.is_a?(String) ?
|
465
|
+
fm_builder.transformations[trans] : trans
|
466
|
+
_transform(transformations, value)
|
467
|
+
end
|
468
|
+
|
469
|
+
protected
|
470
|
+
|
471
|
+
def clean_tag(tag)
|
472
|
+
tag.to_s.downcase.gsub(':','_').intern
|
473
|
+
end
|
474
|
+
|
475
|
+
# generate a name for the array variable corresponding to a single-value variable
|
476
|
+
def arrayize(key)
|
477
|
+
clean_tag(key.to_s + '_array')
|
478
|
+
end
|
479
|
+
|
480
|
+
def unarrayize(key)
|
481
|
+
clean_tag(key.to_s.gsub(/_array$/, ''))
|
482
|
+
end
|
483
|
+
|
484
|
+
private
|
485
|
+
|
486
|
+
def _transform(trans_array, value)
|
487
|
+
trans_array.each do |t|
|
488
|
+
if t.is_a? String
|
489
|
+
value = _transform(fm_builder.transformations[t], value)
|
490
|
+
else
|
491
|
+
if t.is_a? Symbol
|
492
|
+
t_name = t
|
493
|
+
args = []
|
494
|
+
elsif t[0].is_a? Array
|
495
|
+
raise 'array where symbol expected'
|
496
|
+
else
|
497
|
+
t_name = t[0]
|
498
|
+
args = t[1..-1]
|
499
|
+
end
|
500
|
+
|
501
|
+
trans = fm_builder.transformation_fns[t_name] or
|
502
|
+
raise NameError.new("No such transformation #{t_name}", t_name)
|
503
|
+
|
504
|
+
if value.is_a? Array
|
505
|
+
value = value.collect {|x| trans.call(x, *args) }
|
506
|
+
else
|
507
|
+
value = trans.call(value, *args)
|
508
|
+
end
|
509
|
+
end
|
510
|
+
end
|
511
|
+
value
|
512
|
+
end
|
513
|
+
|
514
|
+
def _resolve_value(obj)
|
515
|
+
value = obj
|
516
|
+
if obj.is_a?(FeedData)
|
517
|
+
if fm_builder.value_tags.key? obj.fm_tag_name
|
518
|
+
value = obj.call_virtual_method(fm_builder.value_tags[obj.fm_tag_name])
|
519
|
+
else
|
520
|
+
fm_builder.default_value_tags.each do |tag|
|
521
|
+
value = obj.call_virtual_method(tag) rescue next
|
522
|
+
break unless value.nil?
|
523
|
+
end
|
524
|
+
end
|
329
525
|
end
|
526
|
+
value
|
330
527
|
end
|
331
528
|
end
|
332
529
|
|
@@ -346,19 +543,31 @@ module FeedMe
|
|
346
543
|
alias :feed :channel
|
347
544
|
|
348
545
|
def fm_tag_name
|
349
|
-
@fm_type == FeedMe::
|
546
|
+
@fm_type == FeedMe::ATOM ? 'feed' : 'channel'
|
547
|
+
end
|
548
|
+
|
549
|
+
def fm_prefix
|
550
|
+
fm_type.to_s.downcase
|
350
551
|
end
|
351
552
|
|
352
553
|
private
|
353
554
|
|
354
555
|
def parse
|
355
556
|
# RSS = everything between channel tags + everything between </channel> and </rdf> if this is an RDF document
|
356
|
-
if @fm_source =~ %r{<(?:.*?:)?(
|
357
|
-
@fm_type =
|
557
|
+
if @fm_source =~ %r{<(?:.*?:)?(rss|rdf)(.*?)>.*?<(?:.*?:)?channel(.*?)>(.+)</(?:.*?:)?channel>(.*)</(?:.*?:)?(?:rss|rdf)>}mi
|
558
|
+
@fm_type = $2.upcase.to_s
|
358
559
|
@fm_tags = fm_builder.all_rss_tags
|
359
|
-
attrs = parse_attributes($1, $
|
560
|
+
attrs = parse_attributes($1, $3)
|
360
561
|
attrs[:version] ||= '1.0';
|
361
|
-
parse_content(self, attrs, $
|
562
|
+
parse_content(self, attrs, $4, @fm_tags)
|
563
|
+
|
564
|
+
# for RDF documents, replace references with actual items
|
565
|
+
unless nil_or_empty?($5)
|
566
|
+
refs = FeedData.new(nil, nil, fm_builder)
|
567
|
+
parse_content(refs, {}, $5, @fm_tags)
|
568
|
+
dereference_rdf_tags(:items_array, :item_array, refs) {|a| a.first[:rdf_seq_array].first[:rdf_li_array] }
|
569
|
+
[:image_array, :textinput_array].each {|tag| dereference_rdf_tags(tag, tag, refs) }
|
570
|
+
end
|
362
571
|
# Atom = everything between feed tags
|
363
572
|
elsif @fm_source =~ %r{<(?:.*?:)?feed(.*?)>(.+)</(?:.*?:)?feed>}mi
|
364
573
|
@fm_type = FeedMe::ATOM
|
@@ -369,21 +578,37 @@ module FeedMe
|
|
369
578
|
end
|
370
579
|
end
|
371
580
|
|
581
|
+
# References within the <channel> element are replaced by the actual, referenced elements
|
582
|
+
def dereference_rdf_tags(rdf_tag, rss_tag, refs)
|
583
|
+
if self.key?(rdf_tag)
|
584
|
+
src_items = self.delete(rdf_tag)
|
585
|
+
src_items = yield(src_items) if block_given?
|
586
|
+
ref_items = refs[rss_tag]
|
587
|
+
unless src_items.empty? || ref_items.empty?
|
588
|
+
self[rss_tag] = src_items.collect do |src_item|
|
589
|
+
next unless src_item.key?(:rdf_resource)
|
590
|
+
uri = src_item[:rdf_resource]
|
591
|
+
ref_items.each do |ref_item|
|
592
|
+
next unless ref_item.key?(:rdf_about)
|
593
|
+
if (ref_item[:rdf_about].eql?(uri))
|
594
|
+
ref_item[:rdf_resource] = uri
|
595
|
+
break ref_item
|
596
|
+
end
|
597
|
+
end
|
598
|
+
end
|
599
|
+
end
|
600
|
+
end
|
601
|
+
end
|
602
|
+
|
372
603
|
def parse_content(parent, attrs, content, tags)
|
373
604
|
# add attributes to parent
|
374
|
-
attrs.each_pair {|key, value|
|
375
|
-
|
376
|
-
|
377
|
-
first_tag = 0
|
378
|
-
if !tags.nil? && tags[0].is_a?(Hash)
|
379
|
-
sub_tags = tags[0]
|
380
|
-
first_tag = 1
|
381
|
-
end
|
382
|
-
|
605
|
+
attrs.each_pair {|key, value| parent[key] = unescape(value) }
|
606
|
+
return if content.nil?
|
607
|
+
|
383
608
|
# split the content into elements
|
384
609
|
elements = {}
|
385
|
-
|
386
|
-
content.scan( %r{(<(
|
610
|
+
# TODO: this will break if a namespace is used that is not rss: or atom:
|
611
|
+
content.scan( %r{(<([\w:]+)(.*?)(?:/>|>(.*?)</\2>))}mi ) do |match|
|
387
612
|
# \1 = full content (from start to end tag), \2 = tag name
|
388
613
|
# \3 = attributes, and \4 = content between tags
|
389
614
|
key = clean_tag(match[1])
|
@@ -395,33 +620,37 @@ module FeedMe
|
|
395
620
|
end
|
396
621
|
end
|
397
622
|
|
398
|
-
#
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
623
|
+
# the first item in a tag array may be a hash that defines tags that have subtags
|
624
|
+
sub_tags = tags[0] if !nil_or_empty?(tags) && tags[0].is_a?(Hash)
|
625
|
+
first_tag = sub_tags.nil? || tags.size == 1 ? 0 : 1
|
626
|
+
# if this is a promiscuous parser, tag names will depend on the elements found in the feed
|
627
|
+
tags = elements.keys if (sub_tags.nil? ? nil_or_empty?(tags) : first_tag == 0)
|
628
|
+
|
404
629
|
# iterate over all tags (some or all of which may not be present)
|
405
630
|
tags[first_tag..-1].each do |tag|
|
406
631
|
key = clean_tag(tag)
|
407
|
-
|
632
|
+
element_array = elements.delete(tag) or next
|
408
633
|
@fm_parsed << key
|
409
634
|
|
410
635
|
element_array.each do |elt|
|
636
|
+
elt_attrs = elt[0]
|
637
|
+
elt_content = elt[1]
|
638
|
+
rels = fm_builder.rels[key] if fm_builder.respond_to?(:rels)
|
639
|
+
|
640
|
+
# if a list of accepted rels is specified, only parse this tag
|
641
|
+
# if its rel attribute is included in the list
|
642
|
+
next unless rels.nil? || elt_attrs.nil? || !elt_attrs.rel? || rels.include?(elt_attrs.rel)
|
643
|
+
|
411
644
|
if !sub_tags.nil? && sub_tags.key?(key)
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
new_parent = FeedData.new(key, parent, fm_builder)
|
416
|
-
add_tag(parent, key, new_parent)
|
417
|
-
end
|
418
|
-
parse_content(new_parent, elt[0], elt[1], sub_tags[key])
|
645
|
+
new_parent = FeedData.new(key, parent, fm_builder)
|
646
|
+
add_tag(parent, key, new_parent)
|
647
|
+
parse_content(new_parent, elt_attrs, elt_content, sub_tags[key])
|
419
648
|
else
|
420
|
-
add_tag(parent, key, clean_content(key,
|
649
|
+
add_tag(parent, key, clean_content(key, elt_attrs, elt_content, parent))
|
421
650
|
end
|
422
651
|
end
|
423
652
|
end
|
424
|
-
|
653
|
+
|
425
654
|
@fm_unparsed += elements.keys
|
426
655
|
|
427
656
|
@fm_parsed.uniq!
|
@@ -429,7 +658,7 @@ module FeedMe
|
|
429
658
|
end
|
430
659
|
|
431
660
|
def add_tag(hash, key, value)
|
432
|
-
array_var =
|
661
|
+
array_var = arrayize(key)
|
433
662
|
if hash.key? array_var
|
434
663
|
hash[array_var] << value
|
435
664
|
else
|
@@ -446,18 +675,19 @@ module FeedMe
|
|
446
675
|
content = content.to_s
|
447
676
|
if fm_builder.date_tags.include? tag
|
448
677
|
content = Time.parse(content) rescue unescape(content)
|
449
|
-
else
|
450
|
-
|
678
|
+
else
|
679
|
+
content = unescape(content)
|
451
680
|
end
|
452
681
|
|
453
682
|
unless attrs.empty?
|
454
|
-
hash = FeedData.new(tag, parent, fm_builder
|
683
|
+
hash = FeedData.new(tag, parent, fm_builder)
|
684
|
+
attrs.each_pair {|key, value| hash[key] = unescape(value) }
|
455
685
|
if !content.empty?
|
456
686
|
hash[FeedMe::CONTENT_KEY] = content
|
457
687
|
end
|
458
688
|
return hash
|
459
689
|
end
|
460
|
-
|
690
|
+
|
461
691
|
return content
|
462
692
|
end
|
463
693
|
|
@@ -466,9 +696,9 @@ module FeedMe
|
|
466
696
|
attrs.each do |a|
|
467
697
|
next if a.nil?
|
468
698
|
# pull key/value pairs out of attr string
|
469
|
-
array = a.scan(/(\w+)=['"]?([^'"]+)/)
|
699
|
+
array = a.scan(/([\w:]+)=['"]?([^'"]+)/)
|
470
700
|
# unescape values
|
471
|
-
array = array.collect {|key, value| [clean_tag(
|
701
|
+
array = array.collect {|key, value| [clean_tag(key), unescape(value)]}
|
472
702
|
hash.merge! Hash[*array.flatten]
|
473
703
|
end
|
474
704
|
return hash
|
@@ -484,32 +714,10 @@ module FeedMe
|
|
484
714
|
content = cdata[1] if cdata
|
485
715
|
|
486
716
|
return content
|
487
|
-
|
488
|
-
#if content =~ /([^-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]]%)/n then
|
489
|
-
# CGI.unescapeHTML(content).gsub(/(<!\[CDATA\[|\]\]>)/,'').strip
|
490
|
-
#else
|
491
|
-
# content.gsub(/(<!\[CDATA\[|\]\]>)/,'').strip
|
492
|
-
#end
|
493
|
-
end
|
494
|
-
|
495
|
-
def underscore(camel_cased_word)
|
496
|
-
camel_cased_word.to_s.gsub(/::/, '/').
|
497
|
-
gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
|
498
|
-
gsub(/([a-z\d])([A-Z])/,'\1_\2').
|
499
|
-
tr("-", "_").
|
500
|
-
downcase
|
501
|
-
end
|
502
|
-
|
503
|
-
def camelize(lower_case_and_underscored_word, first_letter_in_uppercase = true)
|
504
|
-
if first_letter_in_uppercase
|
505
|
-
lower_case_and_underscored_word.to_s.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }
|
506
|
-
else
|
507
|
-
lower_case_and_underscored_word[0,1].downcase + camelize(lower_case_and_underscored_word)[1..-1]
|
508
|
-
end
|
509
717
|
end
|
510
718
|
|
511
|
-
def
|
512
|
-
obj.nil? ?
|
719
|
+
def nil_or_empty?(obj)
|
720
|
+
obj.nil? || obj.empty? || (obj.is_a?(String) && obj.strip.empty?)
|
513
721
|
end
|
514
722
|
end
|
515
723
|
|