cdamian-feedlib 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Damian Caruso
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.markdown ADDED
@@ -0,0 +1,19 @@
1
+ ## feedlib
2
+
3
+ Usage:
4
+
5
+ feedlib = Feedlib.instance
6
+ feed = feedlib.fetch("http://domain")
7
+ puts feed.title
8
+ puts feed.description
9
+ feed.entries.each do |entry|
10
+ puts entry.link
11
+ puts entry.author
12
+ puts entry.title
13
+ puts entry.content
14
+ puts entry.published_at
15
+ end
16
+
17
+ ## Copyright
18
+
19
+ Copyright (c) 2009 Damian Caruso. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,61 @@
1
require 'rubygems'
require 'rake'
require 'yaml'

# Gem packaging tasks. They are only defined when Jeweler can be loaded;
# otherwise an installation hint is printed and the remaining tasks still work.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "feedlib"
    gem.summary = %Q{Feed library for building and parsing Atom and RSS feeds}
    gem.email = "damian.caruso@gmail.com"
    gem.homepage = "http://github.com/cdamian/feedlib"
    gem.authors = ["Damian Caruso"]
    gem.description = %Q{Feed library for building and parsing Atom and RSS feeds}
    gem.files = FileList['lib/**/*.rb', '[A-Z]*', 'test/**/*'].to_a
    gem.add_dependency("hpricot", ">= 0.8.1")
    gem.add_dependency("chardet", ">= 0.9.0")
    gem.add_dependency("htmlentities", ">= 4.0.0")
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Unit tests: every test/**/*_test.rb file, with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# Coverage task. When rcov is missing, a stub :rcov task explains how to get it.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

task :default => :test

# Prefer the task class shipped with RDoc; fall back to the legacy one that
# older Rake versions provided.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # The package ships a plain-text VERSION file; VERSION.yml is still honoured
  # for checkouts that use the older YAML layout.
  version =
    if File.exist?('VERSION')
      File.read('VERSION').strip
    elsif File.exist?('VERSION.yml')
      config = YAML.load(File.read('VERSION.yml'))
      "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
    else
      ""
    end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "feedlib #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
61
+
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
@@ -0,0 +1,5 @@
1
# Root of the feedlib error hierarchy; rescue this to catch any library error.
class FeedLibError < StandardError
end

# Raised by Feedlib#fetch when the HTTP request for the feed fails.
class UnavailableFeedSourceError < FeedLibError
end

# Raised when a document contains neither an <rss> nor a <feed> element.
class UnknownFeedTypeError < FeedLibError
end
@@ -0,0 +1,4 @@
1
+ require 'feedlib/feed'
2
+
3
# Feed built from an Atom document. Adds no behaviour of its own to Feed.
class FeedAtom < Feed; end
@@ -0,0 +1,4 @@
1
+ require 'feedlib/feed/entry'
2
+
3
# Entry of an Atom feed. Adds no behaviour of its own to FeedEntry.
class FeedEntryAtom < FeedEntry; end
@@ -0,0 +1,4 @@
1
+ require 'feedlib/feed/entry'
2
+
3
# Entry of an RSS feed. Adds no behaviour of its own to FeedEntry.
class FeedEntryRss < FeedEntry; end
@@ -0,0 +1,11 @@
1
# One entry of a feed. It is a plain value holder: every attribute is
# readable and writable, and all of them are nil on a new instance.
class FeedEntry
  attr_accessor :link
  attr_accessor :author
  attr_accessor :title
  attr_accessor :content
  attr_accessor :published_at

  # Creates an entry with every attribute explicitly set to nil.
  def initialize
    @link = @author = @title = @content = @published_at = nil
  end
end
@@ -0,0 +1,4 @@
1
+ require 'feedlib/feed'
2
+
3
# Feed built from an RSS document. Adds no behaviour of its own to Feed.
class FeedRss < Feed; end
@@ -0,0 +1,14 @@
1
# A parsed feed: descriptive attributes plus the ordered list of its entries.
class Feed
  attr_accessor :title
  attr_accessor :description
  attr_accessor :url
  attr_accessor :entries

  # Creates a feed with nil title, description and url and no entries.
  def initialize
    @title = @description = @url = nil
    @entries = []
  end

  # Appends +entry+ to the feed and returns the entries collection.
  def add_entry(entry)
    entries << entry
  end
end
@@ -0,0 +1,24 @@
1
+ require 'feedlib/parser'
2
+ require 'feedlib/feed/atom'
3
+ require 'feedlib/feed/entry/atom'
4
+
5
# Parser for Atom documents: turns every <entry> element of the source
# document into a FeedEntryAtom collected in a FeedAtom.
class FeedAtomParser < FeedParser
  # Builds @feed from @source_xml. Invoked by FeedParser#initialize.
  def parse_feed
    @feed = FeedAtom.new
    (@source_xml/:entry).each do |entry|
      @feed.add_entry(parse_entry(entry))
    end
  end

  protected

  # Converts one <entry> element into a FeedEntryAtom.
  # Title and content are passed through sanitize; the other fields are
  # copied verbatim from the element.
  def parse_entry(entry)
    new_entry = FeedEntryAtom.new
    new_entry.link = (entry/:link).attr('href')
    new_entry.author = (entry/:author/:name).inner_html
    new_entry.title = sanitize((entry/:title).inner_html)
    new_entry.content = sanitize(entry_body(entry))
    new_entry.published_at = (entry/:published).inner_html
    new_entry
  end

  # Returns the markup of <content>, or of <summary> when <content> is
  # missing or contains only whitespace.
  def entry_body(entry)
    body = (entry/:content).inner_html
    # The original test was inverted (it used <content> only when it was
    # blank) and relied on String#blank?, which core Ruby does not define.
    body = (entry/:summary).inner_html if body.to_s.strip.empty?
    body
  end
end
@@ -0,0 +1,24 @@
1
+ require 'feedlib/parser'
2
+ require 'feedlib/feed/rss'
3
+ require 'feedlib/feed/entry/rss'
4
+
5
# Parser for RSS documents: turns every <item> found under <channel> into a
# FeedEntryRss collected in a FeedRss.
class FeedRssParser < FeedParser
  # Builds @feed from @source_xml. Invoked by FeedParser#initialize.
  def parse_feed
    @feed = FeedRss.new
    (@source_xml/:channel/:item).each do |item|
      @feed.add_entry(parse_entry(item))
    end
  end

  protected

  # Converts one <item> element into a FeedEntryRss.
  # Title and description are passed through sanitize; the other fields are
  # copied verbatim from the element.
  def parse_entry(entry)
    # The element arrives as +entry+; the previous body read an undefined
    # local named +item+ and raised NameError for every RSS entry.
    new_entry = FeedEntryRss.new
    new_entry.link = (entry/:link).inner_html
    new_entry.author = (entry/:author).inner_html
    new_entry.title = sanitize((entry/:title).inner_html)
    new_entry.content = sanitize((entry/:description).inner_html)
    new_entry.published_at = (entry/:pubDate).inner_html
    new_entry
  end
end
@@ -0,0 +1,50 @@
1
# Base class for the format-specific parsers. It keeps the source document,
# triggers parsing on construction and offers the shared sanitize helper.
class FeedParser
  # The feed object assigned by the subclass's parse_feed.
  attr_reader :feed

  # Stores +source_xml+ and immediately calls parse_feed. This class does not
  # define parse_feed itself: subclasses must provide it and set @feed there.
  def initialize(source_xml)
    @source_xml = source_xml
    parse_feed
  end

  protected

  # Returns a cleaned-up copy of +html+ (the argument itself is left
  # untouched, so frozen strings are accepted and nil is treated as ""):
  #
  # * a fixed set of entities is unescaped and CDATA markers are removed;
  # * <script> elements are dropped together with their content;
  # * elements outside the whitelist are replaced by their children;
  # * whitelisted elements lose their attributes (except those listed in
  #   +attrs+ for that tag);
  # * comments are removed;
  # * the result is run through HTMLEntities#decode.
  def sanitize(html)
    whitelist = %w(em i strong u)
    attrs = {}
    blacklist = %w(script)

    whitelist += attrs.keys

    # Work on a private copy: the replacements below are done in place.
    html = html.to_s.dup

    # Order matters: '&amp;' is unescaped after '&lt;'/'&gt;', and the doubled
    # CDATA opener is handled before the single one.
    [
      ['&lt;', '<'],
      ['&gt;', '>'],
      ['&amp;', '&'],
      ['&#39;', "'"],
      ['&quot;', '"'],
      ['<![CDATA[<![CDATA[', ''],
      ['<![CDATA[', ''],
      [']]>', ''],
      [']>', '']
    ].each { |escaped, plain| html.gsub!(escaped, plain) }

    page = Hpricot(html)

    page.search("*").each do |e|
      if e.elem?
        tagname = e.name.downcase
        if blacklist.include?(tagname)
          e.swap("")
        elsif !whitelist.include?(tagname)
          e.parent.replace_child(e, e.children)
        elsif attrs.has_key?(tagname)
          e.attributes.delete_if { |key, val| !attrs[tagname].include?(key.downcase) }
        else
          e.attributes = {}
        end
      elsif e.comment?
        # HTML comments can contain executable scripts, depending on the browser, so we'll
        # be paranoid and just get rid of all of them
        # e.g. <!--[if lt IE 7]><script type="text/javascript">h4x0r();</script><![endif]-->
        e.swap('')
      end
    end

    coder = HTMLEntities.new
    coder.decode(page.to_s)
  end
end
data/lib/feedlib.rb ADDED
@@ -0,0 +1,36 @@
1
require 'iconv'
require 'open-uri'
require 'singleton'

require 'hpricot'
require 'htmlentities'
require 'UniversalDetector'

require 'feedlib/errors'
require 'feedlib/parser/atom'
require 'feedlib/parser/rss'
7
+
8
# Entry point of the library. Obtain the single instance with
# Feedlib.instance and call #fetch to download and parse a feed.
class Feedlib
  include Singleton

  # Downloads +url+, converts the document to UTF-8 using the encoding
  # reported by UniversalDetector, and returns the feed produced by the
  # matching parser.
  #
  # Raises UnavailableFeedSourceError when the request fails with an
  # OpenURI::HTTPError (the original error message is kept), and
  # UnknownFeedTypeError when the document is neither RSS nor Atom.
  def fetch(url)
    # Block form guarantees the handle is closed even if reading raises.
    # NOTE(review): Kernel#open also accepts local paths and "|command"
    # strings - confirm callers never pass untrusted input here.
    xml = open(url, 'r') { |f| f.read }
    encoding = UniversalDetector::encoding(xml)
    xml = Iconv.iconv("UTF-8", encoding, xml).join
    get_parser(xml).feed
  rescue OpenURI::HTTPError => e
    raise UnavailableFeedSourceError, e.message
  end

  private

  # Parses +xml+ and returns the parser matching its root format:
  # FeedRssParser when an <rss> element is present, FeedAtomParser when a
  # <feed> element is present. Raises UnknownFeedTypeError otherwise.
  def get_parser(xml)
    doc = Hpricot.XML(xml)
    if doc.search(:rss).size > 0
      FeedRssParser.new(doc)
    elsif doc.search(:feed).size > 0
      FeedAtomParser.new(doc)
    else
      raise UnknownFeedTypeError
    end
  end
end
@@ -0,0 +1,7 @@
1
+ require 'test_helper'
2
+
3
# Basic coverage of the feed model and of feed-type detection. Replaces the
# generated placeholder that failed unconditionally.
class FeedlibTest < Test::Unit::TestCase
  # A new feed has no entries and keeps added entries in insertion order.
  def test_feed_collects_entries_in_insertion_order
    feed = Feed.new
    assert_equal [], feed.entries

    first = FeedEntry.new
    second = FeedEntry.new
    feed.add_entry(first)
    feed.add_entry(second)

    assert_equal [first, second], feed.entries
  end

  # Every attribute of a freshly created entry is nil.
  def test_new_entry_has_no_attributes_set
    entry = FeedEntry.new
    [:link, :author, :title, :content, :published_at].each do |attribute|
      assert_nil entry.send(attribute)
    end
  end

  # A document with neither an <rss> nor a <feed> element is rejected.
  def test_unrecognised_document_raises_unknown_feed_type_error
    assert_raise(UnknownFeedTypeError) do
      Feedlib.instance.send(:get_parser, "<html><body/></html>")
    end
  end
end
@@ -0,0 +1,9 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+
4
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ require 'feedlib'
7
+
8
# Reopens Test::Unit::TestCase without adding anything to it.
class Test::Unit::TestCase; end
metadata ADDED
@@ -0,0 +1,100 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cdamian-feedlib
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Damian Caruso
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-07-11 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hpricot
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 0.8.1
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: chardet
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.9.0
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: htmlentities
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: 4.0.0
44
+ version:
45
+ description: Feed library for building and parsing Atom and RSS feeds
46
+ email: damian.caruso@gmail.com
47
+ executables: []
48
+
49
+ extensions: []
50
+
51
+ extra_rdoc_files:
52
+ - LICENSE
53
+ - README.markdown
54
+ files:
55
+ - LICENSE
56
+ - README.markdown
57
+ - Rakefile
58
+ - VERSION
59
+ - lib/feedlib.rb
60
+ - lib/feedlib/errors.rb
61
+ - lib/feedlib/feed.rb
62
+ - lib/feedlib/feed/atom.rb
63
+ - lib/feedlib/feed/entry.rb
64
+ - lib/feedlib/feed/entry/atom.rb
65
+ - lib/feedlib/feed/entry/rss.rb
66
+ - lib/feedlib/feed/rss.rb
67
+ - lib/feedlib/parser.rb
68
+ - lib/feedlib/parser/atom.rb
69
+ - lib/feedlib/parser/rss.rb
70
+ - test/feedlib_test.rb
71
+ - test/test_helper.rb
72
+ has_rdoc: false
73
+ homepage: http://github.com/cdamian/feedlib
74
+ post_install_message:
75
+ rdoc_options:
76
+ - --charset=UTF-8
77
+ require_paths:
78
+ - lib
79
+ required_ruby_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: "0"
84
+ version:
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: "0"
90
+ version:
91
+ requirements: []
92
+
93
+ rubyforge_project:
94
+ rubygems_version: 1.2.0
95
+ signing_key:
96
+ specification_version: 3
97
+ summary: Feed library for building and parsing Atom and RSS feeds
98
+ test_files:
99
+ - test/feedlib_test.rb
100
+ - test/test_helper.rb