feedbase 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/NOTES +65 -0
- data/README.markdown +22 -0
- data/Rakefile +23 -0
- data/db/create.sql +40 -0
- data/db/setup.sh +3 -0
- data/feedbase.gemspec +24 -0
- data/lib/feedbase/feed.rb +92 -0
- data/lib/feedbase/feed_parser.rb +153 -0
- data/lib/feedbase/fetch_feed.rb +81 -0
- data/lib/feedbase/html_simplifier.rb +165 -0
- data/lib/feedbase.rb +7 -0
- metadata +78 -0
data/.gitignore
ADDED
data/NOTES
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
------------------------------------------------------------------------
|
2
|
+
Fri Jun 24 08:57:18 EDT 2011
|
3
|
+
|
4
|
+
This will be the feed aggregator engine for Kindlefeeder, but also
|
5
|
+
usable for other projects.
|
6
|
+
|
7
|
+
PostgreSQL
|
8
|
+
|
9
|
+
db/create.sql
|
10
|
+
|
11
|
+
Create a database called 'feeds'
|
12
|
+
|
13
|
+
? rchardet https://rubygems.org/gems/edouard-rchardet
|
14
|
+
|
15
|
+
API
|
16
|
+
|
17
|
+
FetchFeed.from_url [feed-url]
|
18
|
+
|
19
|
+
takes care of getting the feed, parsing, and storing in postgres
|
20
|
+
|
21
|
+
Next step is to rebuild the Rails app around this.
|
22
|
+
|
23
|
+
Users
|
24
|
+
Subscriptions
|
25
|
+
Schedule
|
26
|
+
|
27
|
+
What's the most conservative way of doing this?
|
28
|
+
|
29
|
+
Keep MySQL and the schema,
|
30
|
+
|
31
|
+
Payload generator is a discrete part. Refactor this. and keep the middle
|
32
|
+
|
33
|
+
bin/payload_generator2 is not used any more. Feeds are fetched in
|
34
|
+
lib/feed_couch.rb #refresh method. It doesn't seem this refresh method
|
35
|
+
can handle https
|
36
|
+
|
37
|
+
RelaxedPayloadGenerator is the key class used by Delivery.
|
38
|
+
|
39
|
+
Payload Specification can be turned into a TABLE and Model. Or the
|
40
|
+
fields can be added to delivery as discrete fields (instead of blob);
|
41
|
+
|
42
|
+
TODO.
|
43
|
+
- Check ETag when updating feed.
|
44
|
+
- private password feeds as user requested
|
45
|
+
- NEW MOBI generation; periodical format
|
46
|
+
- migrate feeds from MySQL to Postgresql
|
47
|
+
- check new feed creation
|
48
|
+
- clean up rails; move to rails 3; put into git; simplify deployment
|
49
|
+
|
50
|
+
|
51
|
+
|
52
|
+
------------------------------------------------------------------------
|
53
|
+
Sat Jun 25 23:40:52 EDT 2011
|
54
|
+
|
55
|
+
|
56
|
+
postgresql functions
|
57
|
+
currval()
|
58
|
+
setval()
|
59
|
+
|
60
|
+
I can perhaps sync the feed_id of this table with feeds.id in the rails
|
61
|
+
database?
|
62
|
+
|
63
|
+
|
64
|
+
|
65
|
+
|
data/README.markdown
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# Feedbase
|
2
|
+
|
3
|
+
Better Instructions forthcoming
|
4
|
+
|
5
|
+
1. Create a PostgreSQL database called feeds.
|
6
|
+
2. Load database script db/create.sql.
|
7
|
+
|
8
|
+
API
|
9
|
+
|
10
|
+
|
11
|
+
Feedbase::Feed[feed_url: feed_url] || Feedbase::Feed.create(feed_url: feed_url)
|
12
|
+
|
13
|
+
## Instance methods and attributes
|
14
|
+
|
15
|
+
Feedbase::Feed#refresh
|
16
|
+
|
17
|
+
Feedbase::Feed#title
|
18
|
+
Feedbase::Feed#feed_url
|
19
|
+
Feedbase::Feed#items
|
20
|
+
|
21
|
+
|
22
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'rake'
|
2
|
+
require 'rake/testtask'
|
3
|
+
require 'bundler'
|
4
|
+
Bundler::GemHelper.install_tasks
|
5
|
+
|
6
|
+
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), 'lib')
|
7
|
+
|
8
|
+
desc "Run tests"
|
9
|
+
task :test do
|
10
|
+
$:.unshift File.expand_path("test")
|
11
|
+
require 'test_helper'
|
12
|
+
Dir.chdir("test") do
|
13
|
+
Dir['*_test.rb'].each do |x|
|
14
|
+
puts "requiring #{x}"
|
15
|
+
require x
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
MiniTest::Unit.autorun
|
20
|
+
end
|
21
|
+
|
22
|
+
task :default => :test
|
23
|
+
|
data/db/create.sql
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
-- this is for postgresql
|
2
|
+
drop table if exists feeds CASCADE;
|
3
|
+
drop table if exists feed_downloads CASCADE;
|
4
|
+
drop table if exists items CASCADE;
|
5
|
+
|
6
|
+
create table feeds (
|
7
|
+
feed_id serial primary key,
|
8
|
+
feed_url varchar UNIQUE NOT NULL,
|
9
|
+
title varchar,
|
10
|
+
alpha_title varchar,
|
11
|
+
subtitle varchar,
|
12
|
+
web_url varchar,
|
13
|
+
favicon_url varchar,
|
14
|
+
subscribers integer default 0,
|
15
|
+
created timestamp default now()
|
16
|
+
);
|
17
|
+
|
18
|
+
create table feed_downloads (
|
19
|
+
feed_download_id serial primary key,
|
20
|
+
feed_id integer REFERENCES feeds (feed_id) ON DELETE CASCADE,
|
21
|
+
download_time float,
|
22
|
+
headers text,
|
23
|
+
encoding varchar,
|
24
|
+
etag varchar,
|
25
|
+
last_modified timestamp,
|
26
|
+
created timestamp default now()
|
27
|
+
);
|
28
|
+
|
29
|
+
create table items (
|
30
|
+
item_id serial primary key,
|
31
|
+
feed_id integer REFERENCES feeds (feed_id) ON DELETE CASCADE,
|
32
|
+
guid varchar UNIQUE NOT NULL,
|
33
|
+
title varchar,
|
34
|
+
link varchar,
|
35
|
+
content text,
|
36
|
+
author varchar,
|
37
|
+
word_count integer,
|
38
|
+
pub_date timestamp default now()
|
39
|
+
);
|
40
|
+
|
data/db/setup.sh
ADDED
data/feedbase.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |s|
|
5
|
+
s.name = "feedbase"
|
6
|
+
s.version = '0.0.1'
|
7
|
+
s.platform = Gem::Platform::RUBY
|
8
|
+
s.required_ruby_version = '>= 1.9.0'
|
9
|
+
|
10
|
+
s.authors = ["Daniel Choi"]
|
11
|
+
s.email = ["dhchoi@gmail.com"]
|
12
|
+
s.homepage = ""
|
13
|
+
s.summary = %q{A feed aggregator database}
|
14
|
+
s.description = %q{}
|
15
|
+
|
16
|
+
s.rubyforge_project = "feedbase"
|
17
|
+
|
18
|
+
s.files = `git ls-files`.split("\n")
|
19
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
20
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
21
|
+
s.require_paths = ["lib"]
|
22
|
+
|
23
|
+
s.add_dependency 'sequel'
|
24
|
+
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
require 'sequel'
|
2
|
+
require 'feedbase/fetch_feed'
|
3
|
+
DB = Sequel.connect 'postgres:///feeds'
|
4
|
+
|
5
|
+
module Feedbase
|
6
|
+
|
7
|
+
class Redirected < StandardError; end
|
8
|
+
|
9
|
+
class Feed < Sequel::Model
|
10
|
+
one_to_many :items
|
11
|
+
one_to_many :feed_downloads
|
12
|
+
|
13
|
+
# Downloads the feed, updates this record from it, logs the download, and
# stores every item whose guid is not yet in the items table.
#
# force - when true, download even if the server's ETag equals the ETag
#         recorded for the previous download.
#
# Returns an Array of the Feedbase::Item records created (callers can take
# its size for a count), or nil when the ETag matched and nothing was fetched.
# Raises RuntimeError if feed_url is nil, and Feedbase::Redirected if the
# fetch reports a different URL that another Feed record already uses.
def refresh(force=false)
  raise "Missing feed_url" if feed_url.nil?
  ff = Feedbase::FetchFeed.new(feed_url)
  headers = ff.headers
  if !force && last_etag && headers[:etag] == last_etag
    puts "-- #{feed_url} -- ETag cache hit"
    return
  end

  data = ff.fetch
  feed_params = data[:feed_params]
  params = feed_params.merge(:alpha_title => make_alpha_title(feed_params[:title]))
  if params[:feed_url] != self[:feed_url]
    if x = self.class.filter(:feed_url => params[:feed_url]).first
      raise Redirected.new("Redirected to existing feed: #{x.feed_url}")
    end
  end
  # Only the descriptive columns are rewritten; feed_url is left as stored.
  params.delete(:feed_url)
  begin
    update params
  rescue StandardError # PGError
    puts "The offending record is #{self.inspect}"
    raise
  end

  Feedbase::FeedDownload.create({feed_id: feed_id}.merge(data[:download_params]))
  # Drop the memoized download so last_etag/last_modified see the new row.
  @last_download = nil

  # Checking for an existing guid right before each insert (instead of
  # filtering the whole list up front) also skips a guid that appears twice
  # in the same download, which would otherwise hit the UNIQUE constraint.
  data[:items].each_with_object([]) do |item, created|
    next if Feedbase::Item[:guid => item[:guid]]
    title = item[:title]
    created << Feedbase::Item.create(
      feed_id: feed_id,
      title: title && title.encode("utf-8"), # items may legitimately lack a title
      guid: item[:guid],
      link: item[:link],
      content: item[:content],
      author: item[:author],
      word_count: item[:word_count],
      pub_date: item[:pub_date]
    )
  end
end
|
59
|
+
|
60
|
+
# Returns the most recently recorded FeedDownload for this feed (memoized),
# or nil when the feed has never been downloaded.
#
# The dataset is ordered explicitly by the serial primary key, newest first;
# without an ORDER BY the database is free to return any matching row.
def last_download
  @last_download ||= FeedDownload.filter(feed_id: feed_id).
    reverse_order(:feed_download_id).first
end
|
63
|
+
|
64
|
+
def last_etag
|
65
|
+
last_download && last_download.etag
|
66
|
+
end
|
67
|
+
|
68
|
+
def last_modified
|
69
|
+
last_download && last_download.last_modified
|
70
|
+
end
|
71
|
+
|
72
|
+
# Builds the title used for alphabetical sorting by dropping one leading
# English article ("The", "A" or "An") followed by whitespace.
#
# s - the feed title String, or nil.
#
# Returns a new String, or nil when s is nil.
def make_alpha_title(s)
  return if s.nil?
  # \A anchors to the start of the whole string; ^ would also match after
  # every newline and strip articles from later lines of a multi-line title.
  s.sub(/\A(The|A|An)\s/, '')
end
|
76
|
+
end
|
77
|
+
|
78
|
+
class Item < Sequel::Model
|
79
|
+
many_to_one :feed
|
80
|
+
end
|
81
|
+
|
82
|
+
class FeedDownload < Sequel::Model
|
83
|
+
end
|
84
|
+
|
85
|
+
end
|
86
|
+
|
87
|
+
if __FILE__ == $0
|
88
|
+
feed = Feedbase::Feed[feed_url: ARGV.first] || Feedbase::Feed.create(feed_url: ARGV.first)
|
89
|
+
puts feed
|
90
|
+
puts feed.refresh
|
91
|
+
end
|
92
|
+
|
@@ -0,0 +1,153 @@
|
|
1
|
+
require 'rexml/streamlistener'
|
2
|
+
require 'rexml/document'
|
3
|
+
require 'pp'
|
4
|
+
require 'iconv'
|
5
|
+
require 'feedbase/html_simplifier'
|
6
|
+
require 'date'
|
7
|
+
|
8
|
+
module Feedbase
|
9
|
+
|
10
|
+
class FeedParser
|
11
|
+
# Try to have the XML in UTF-8 when you call this.
|
12
|
+
def initialize(xml)
|
13
|
+
@xml = xml
|
14
|
+
@listener = FeedListener.new
|
15
|
+
REXML::Document.parse_stream(@xml, @listener)
|
16
|
+
end
|
17
|
+
|
18
|
+
def result
|
19
|
+
tidy(@listener.result)
|
20
|
+
end
|
21
|
+
|
22
|
+
def tidy(feed)
|
23
|
+
feed[:items] = feed[:items].map do |item|
|
24
|
+
body = item[:content] || item[:summary] || ""
|
25
|
+
new_body = HtmlSimplifier.new(body, "utf-8").result.
|
26
|
+
gsub(%r{<p>(\n|<br/>)+</p>}, '').
|
27
|
+
strip + "\n\n"
|
28
|
+
item.delete(:summary)
|
29
|
+
item[:content] = new_body
|
30
|
+
item[:word_count] = word_count(new_body)
|
31
|
+
item
|
32
|
+
end
|
33
|
+
feed
|
34
|
+
end
|
35
|
+
|
36
|
+
# Counts the whitespace-separated words in an HTML string, ignoring markup.
#
# string - HTML text.
#
# Returns the Integer number of words left once tags are removed.
def word_count(string)
  # %r{} builds a Regexp; a bare %{} is a plain String, which gsub would
  # match literally and therefore never strip any tag.
  # split with no argument also discards leading whitespace, so text that
  # begins with a blank does not count an extra empty word.
  string.gsub(%r{</?[^>]+>}, '').split.size
end
|
39
|
+
end
|
40
|
+
|
41
|
+
class FeedListener
|
42
|
+
include REXML::StreamListener
|
43
|
+
|
44
|
+
FEED_TITLE_TAGS = %w[ feed/title rss/channel/title rdf:RDF/channel/title ]
|
45
|
+
|
46
|
+
FEED_LINK_TAGS = %w[ rss/channel/link rdf:RDF/channel/link ]
|
47
|
+
|
48
|
+
ITEM_START_TAGS = %w[ feed/entry rss/channel/item rdf:RDF/item ]
|
49
|
+
|
50
|
+
ITEM_TITLE_TAGS = %w[ feed/entry/title rss/channel/item/title rdf:RDF/item/title ]
|
51
|
+
|
52
|
+
ITEM_AUTHOR_TAGS = %w[ feed/entry/author/name rss/channel/item/author rdf:RDF/item/dc:creator ]
|
53
|
+
|
54
|
+
ITEM_GUID_TAGS = %w[ feed/entry/id rss/channel/item/guid rdf:RDF/item/guid rdf:RDF/item/feedburner:origLink ]
|
55
|
+
|
56
|
+
ITEM_PUB_DATE_TAGS = %w[ feed/entry/published feed/entry/created feed/entry/modified rss/channel/item/pubDate rdf:RDF/item/dc:date ]
|
57
|
+
|
58
|
+
ITEM_LINK_TAGS = %w[ rss/channel/item/link rdf:RDF/item/link ]
|
59
|
+
|
60
|
+
ITEM_SUMMARY_TAGS = %w[ feed/entry/summary rss/channel/item/description rdf:RDF/item/description ]
|
61
|
+
ITEM_CONTENT_TAGS = [ %r{feed/entry/content}, %r{rss/channel/item/content}, %r{rss/channel/item/content:encoded}, %r{rss/item/content}, %r{rdf:RDF/item/content} ]
|
62
|
+
|
63
|
+
def initialize
|
64
|
+
@nested_tags = []
|
65
|
+
@x = {:items => []}
|
66
|
+
end
|
67
|
+
|
68
|
+
def result; @x; end
|
69
|
+
|
70
|
+
# Stream callback for an opening tag: records the element on the path stack,
# starts a new item hash when an item/entry element opens, and captures Atom
# <link href="..."> values for the feed and for the current entry.
#
# name  - element name as it appears in the document.
# attrs - Hash of attribute name => value.
#
# Atom allows several <link> elements per feed/entry (rel="self", "replies",
# "enclosure", ...). The web link is the one with rel="alternate" or with no
# rel at all, so that one always wins; any other link is kept only as a
# fallback while nothing better has been seen.
def tag_start(name, attrs)
  @nested_tags.push name
  rel = attrs['rel']
  alternate = rel.nil? || rel == 'alternate'
  case path
  when 'feed/link'
    @x[:link] = encode(attrs['href']) if alternate || @x[:link].nil?
  when *ITEM_START_TAGS
    @current_item = {}
  when 'feed/entry/link'
    @current_item[:link] = encode(attrs['href']) if alternate || @current_item[:link].nil?
  end
end
|
81
|
+
|
82
|
+
def tag_end(name)
|
83
|
+
case path
|
84
|
+
when *ITEM_START_TAGS
|
85
|
+
@x[:items] << @current_item
|
86
|
+
@current_item = nil
|
87
|
+
end
|
88
|
+
@nested_tags.pop
|
89
|
+
end
|
90
|
+
|
91
|
+
# Stream callback for character data (also aliased as the CDATA callback).
# Routes the text to a feed-level or item-level field chosen by the current
# element path.
#
# text - one chunk of character data; a single element may deliver several
#        chunks (for example whitespace, then a CDATA section, then more
#        whitespace).
#
# Single-valued fields (titles, links, author, guid, date) ignore chunks that
# are blank after stripping, so whitespace surrounding a CDATA section cannot
# overwrite the real value. Summary and content accumulate every chunk.
def text(text)
  stripped = text.strip
  case path
  when *FEED_TITLE_TAGS
    @x[:title] = encode(stripped) unless stripped.empty?
  when *FEED_LINK_TAGS
    @x[:link] = encode(stripped) unless stripped.empty?
  when *ITEM_TITLE_TAGS
    @current_item[:title] = encode(stripped) unless stripped.empty?
  when *ITEM_AUTHOR_TAGS
    @current_item[:author] = encode(stripped) unless stripped.empty?
  when *ITEM_GUID_TAGS
    # Stripped so that incidental whitespace does not produce a distinct guid.
    @current_item[:guid] = encode(stripped) unless stripped.empty?
  when *ITEM_PUB_DATE_TAGS
    unless stripped.empty?
      begin
        @current_item[:pub_date] = DateTime.parse(encode(stripped))
      rescue ArgumentError
        # An unparseable date leaves pub_date unset rather than aborting the
        # parse of the whole feed.
      end
    end
  when *ITEM_LINK_TAGS
    @current_item[:link] = encode(stripped) unless stripped.empty?
  when *ITEM_SUMMARY_TAGS
    # Append to a string owned by the item instead of to the parser's chunk.
    (@current_item[:summary] ||= "") << encode(text)
  when *ITEM_CONTENT_TAGS
    (@current_item[:content] ||= "") << encode(text)
  end
end
|
121
|
+
alias_method :cdata, :text
|
122
|
+
|
123
|
+
# Stream callback for the <?xml ...?> declaration. Stores the declared
# encoding, lower-cased, under :orig_encoding; when the declaration names no
# encoding the value "UTF-8" is stored instead.
def xmldecl(decl, encoding, extra)
  @x[:orig_encoding] = encoding ? encoding.downcase : "UTF-8"
end
|
130
|
+
|
131
|
+
|
132
|
+
def path
|
133
|
+
@nested_tags.join('/')
|
134
|
+
end
|
135
|
+
|
136
|
+
|
137
|
+
# encoding method
|
138
|
+
def encode(string)
|
139
|
+
string
|
140
|
+
end
|
141
|
+
|
142
|
+
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
146
|
+
if __FILE__ == $0
|
147
|
+
feeds = ARGV
|
148
|
+
feeds.each do |feed|
|
149
|
+
xml = File.read feed
|
150
|
+
f = Feedbase::FeedParser.new(xml)
|
151
|
+
pp f.result
|
152
|
+
end
|
153
|
+
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'feedbase/feed_parser'
|
2
|
+
require 'timeout'
|
3
|
+
require 'iconv'
|
4
|
+
|
5
|
+
module Feedbase
|
6
|
+
class FetchFeed
|
7
|
+
|
8
|
+
attr_accessor :feed_url
|
9
|
+
|
10
|
+
def initialize(feed_url)
|
11
|
+
@feed_url = feed_url
|
12
|
+
end
|
13
|
+
|
14
|
+
# Requests the feed's HTTP headers with curl (HEAD request, following
# redirects) and memoizes the parsed result.
#
# Returns a Hash:
#   :headers       - raw header text of every response curl printed, with
#                    line endings normalized to "\n"
#   :encoding      - charset named in a Content-Type header, or nil
#   :etag          - value of an ETag header, or nil
#   :last_modified - Last-Modified header parsed into a DateTime, or nil
#
# Raises RuntimeError ("Response not OK") unless a status line with code 200
# is present, and Timeout::Error if curl takes longer than 20 seconds.
def headers
  return @headers if @headers

  agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.3) Gecko/2008092416 Firefox/3.0.3"
  raw = Timeout::timeout(20) do
    # Argument-array form: curl is executed directly, without a shell, so
    # quotes or other shell metacharacters in the URL cannot run commands.
    # --url keeps a value starting with "-" from being read as an option.
    res = IO.popen(["curl", "-sIL", "-A", agent, "--url", feed_url]) { |io| io.read }.
      gsub("\r\n", "\n")
    # Match on the status code only: HTTP/2 status lines carry no "OK" text.
    if res !~ /^HTTP\S*\s+200\b/
      puts res.inspect
      raise "Response not OK"
    end
    res
  end

  #TODO check for xml
  # Header names are matched case-insensitively (HTTP/2 sends them lower-case).
  last_modified = raw[/^Last-Modified: (.*)$/i, 1]
  @headers = { headers: raw,
               encoding: raw[/^Content-Type:.*charset=(.*)$/i, 1],
               etag: raw[/^ETag: (.*)$/i, 1],
               last_modified: last_modified && DateTime.parse(last_modified) }
end
|
37
|
+
|
38
|
+
# Downloads the feed body with curl, converts it to UTF-8, and parses it.
#
# Returns a Hash with three keys:
#   :feed_params     - :feed_url (the last Location header seen in the header
#                      response, otherwise the original feed_url), plus
#                      :title and :web_url taken from the parsed feed
#   :items           - the item hashes produced by FeedParser
#   :download_params - the #headers hash plus :download_time in seconds
#
# Raises Timeout::Error if the download takes longer than 20 seconds, and
# whatever #headers raises when the header request fails.
def fetch
  url = fix_url(feed_url)
  agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.3) Gecko/2008092416 Firefox/3.0.3"
  start_time = Time.now
  body = Timeout::timeout(20) do
    headers # memoized; fails early when the URL does not answer with a 200
    # Argument-array form: curl is executed directly, without a shell, so
    # quotes or other shell metacharacters in the URL cannot run commands.
    IO.popen(["curl", "-sL", "-A", agent, "--url", url]) { |io| io.read }
  end
  elapsed = Time.now - start_time

  # Keep the redirect target in its own local. Assigning to a bare `feed_url`
  # would create a local variable that shadows the accessor and is nil
  # whenever no redirect happened.
  redirects = headers[:headers].scan(/^Location: (.*)$/i).flatten
  final_url = redirects.empty? ? feed_url : redirects.last

  utf8 = Iconv.conv("UTF-8//TRANSLIT//IGNORE", (headers[:encoding] || 'iso-8859-1'), body)
  f = FeedParser.new(utf8).result

  { feed_params: {:feed_url => final_url, :title => f[:title], :web_url => f[:link]},
    items: f[:items],
    download_params: headers.merge(download_time: elapsed) }
end
|
65
|
+
|
66
|
+
# Ensures a URL carries an http or https scheme.
#
# url - the URL String as entered.
#
# Returns url unchanged when it already starts with http:// or https://
# (in any letter case), otherwise a new String with "http://" prepended.
def fix_url(url)
  # \A anchors to the start of the string (^ would match after any newline);
  # the i flag accepts schemes such as "HTTP://", which are equally valid.
  return url if url =~ %r{\Ahttps?://}i
  "http://" + url
end
|
72
|
+
end
|
73
|
+
|
74
|
+
end
|
75
|
+
|
76
|
+
|
77
|
+
if __FILE__ == $0
|
78
|
+
puts Feedbase::FetchFeed.new(ARGV.first).fetch
|
79
|
+
end
|
80
|
+
|
81
|
+
|
@@ -0,0 +1,165 @@
|
|
1
|
+
#!/usr/bin/env ruby19
|
2
|
+
$:.unshift(File.dirname(__FILE__) + "/../lib")
|
3
|
+
|
4
|
+
# Takes output of feed_file_generator.rb encoded in INPUT_ENCODING as input and
|
5
|
+
# strips superfluous markup from the feed item bodies.
|
6
|
+
|
7
|
+
#require 'feed_file_generator'
|
8
|
+
require 'fileutils'
|
9
|
+
require 'rexml/streamlistener'
|
10
|
+
require 'rexml/document'
|
11
|
+
require 'open3'
|
12
|
+
|
13
|
+
# NOTE requires the htmltidy program
|
14
|
+
# http://tidy.sourceforge.net/docs/Overview.html
|
15
|
+
|
16
|
+
INPUT_ENCODING = 'utf-8'
|
17
|
+
|
18
|
+
module Feedbase
|
19
|
+
class HtmlSimplifier
|
20
|
+
include FileUtils::Verbose
|
21
|
+
attr :result
|
22
|
+
|
23
|
+
# Takes an HTML fragment and the name of its original encoding; read the simplified markup from #result.
|
24
|
+
def initialize(html, orig_encoding)
|
25
|
+
@orig_encoding = orig_encoding
|
26
|
+
@xml = tidy(pre_cleanup(html))
|
27
|
+
@result = parse.gsub(/<http[^>]+>/, "")
|
28
|
+
end
|
29
|
+
|
30
|
+
def parse
|
31
|
+
@listener = FeedHtmlListener.new
|
32
|
+
REXML::Document.parse_stream(@xml, @listener)
|
33
|
+
@listener.result + "\n\n"
|
34
|
+
end
|
35
|
+
|
36
|
+
# Removes empty <o:p></o:p> elements from the markup before it is handed
# to tidy.
#
# html - the HTML String.
#
# Returns a new String; the argument is left untouched, so frozen strings
# are accepted and the caller's data is not modified behind its back.
def pre_cleanup(html)
  html.gsub("<o:p></o:p>", "")
end
|
40
|
+
|
41
|
+
def self.tidy(html, orig_encoding)
|
42
|
+
# assumes input encoding of latin 1
|
43
|
+
#output = Open3.popen3("tidy -q -n -wrap 120 -asxml -latin1") do |stdin, stdout, stderr|
|
44
|
+
#output = IO.popen("tidy -q -n -wrap 120 -asxml -latin1", "r+") do |pipe|
|
45
|
+
#output = IO.popen("tidy -q -wrap 120 -raw -asxml ", "r+") do |pipe| # if from latin1
|
46
|
+
|
47
|
+
tidy = "tidy -q -wrap 120 -n -utf8 -asxml 2>/dev/null"
|
48
|
+
output = IO.popen(tidy, "r+") do |pipe|
|
49
|
+
input = <<-END
|
50
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
51
|
+
<html xmlns="http://www.w3.org/1999/xhtml">
|
52
|
+
<head><title></title></head><body>#{html}</body></html>
|
53
|
+
END
|
54
|
+
pipe.puts input
|
55
|
+
pipe.close_write
|
56
|
+
#$stderr.puts stderr.read
|
57
|
+
pipe.read
|
58
|
+
end
|
59
|
+
output
|
60
|
+
end
|
61
|
+
|
62
|
+
def tidy(html)
|
63
|
+
self.class.tidy html, @orig_encoding
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
|
68
|
+
class FeedHtmlListener
|
69
|
+
include REXML::StreamListener
|
70
|
+
|
71
|
+
STRIP_TAGS = %w[ body font ]
|
72
|
+
BLOCK_TAGS = %w[ p div ]
|
73
|
+
HEADER_TAGS = %w[ h1 h2 h3 h4 h5 h6 ]
|
74
|
+
|
75
|
+
UNIFORM_HEADER_TAG = "h4"
|
76
|
+
|
77
|
+
def initialize
|
78
|
+
@nested_tags = []
|
79
|
+
@content = [""]
|
80
|
+
end
|
81
|
+
|
82
|
+
# Joins the collected content chunks into the simplified markup, separated
# by blank lines; chunks that end up empty are dropped.
def result
  # strip_empty_tags runs twice per chunk so that an empty tag nested inside
  # another tag (such as <p>) is caught too. Not foolproof, but good enough
  # for now.
  cleaned = @content.map do |chunk|
    first_pass = strip_empty_tags(chunk).strip
    strip_empty_tags(first_pass)
  end
  cleaned.reject { |chunk| chunk == "" }.compact.join("\n\n")
end
|
88
|
+
|
89
|
+
# Returns a copy of line with every element removed whose opening tag is
# followed, after optional whitespace only, by its own closing tag.
def strip_empty_tags(line)
  empty_element = %r{<(\w+)[^>]*>\s*</\1>}
  line.gsub(empty_element, '')
end
|
92
|
+
|
93
|
+
def tag_start(name, attrs)
|
94
|
+
@nested_tags.push name
|
95
|
+
case name
|
96
|
+
when 'a'
|
97
|
+
# effectively strips out all style tags
|
98
|
+
@content[-1] << "<a href='#{attrs['href']}'>"
|
99
|
+
when 'img'
|
100
|
+
if attrs['alt']
|
101
|
+
text = (attrs['alt'].strip == '') ? 'image ' : "image:#{attrs['alt']} "
|
102
|
+
@content[-1] << text
|
103
|
+
end
|
104
|
+
when *HEADER_TAGS
|
105
|
+
@content << "<#{UNIFORM_HEADER_TAG}>"
|
106
|
+
when 'br' #skip
|
107
|
+
@content << "<br/>"
|
108
|
+
when 'blockquote'
|
109
|
+
@content << "<blockquote>"
|
110
|
+
when 'ul', 'ol', 'dl'
|
111
|
+
@content << "<#{name}>"
|
112
|
+
when 'li', 'dt', 'dd'
|
113
|
+
@content[-1] << " <#{name}>"
|
114
|
+
when 'strong', 'em'
|
115
|
+
@content[-1] << "<#{name}>"
|
116
|
+
when *BLOCK_TAGS
|
117
|
+
@content << "<p>"
|
118
|
+
when 'pre'
|
119
|
+
@content << "<pre>"
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
def tag_end(name)
|
124
|
+
@nested_tags.pop
|
125
|
+
case name
|
126
|
+
when 'a'
|
127
|
+
@content[-1] << "</a>"
|
128
|
+
when *HEADER_TAGS
|
129
|
+
@content[-1] << "</#{UNIFORM_HEADER_TAG}>"
|
130
|
+
when 'blockquote'
|
131
|
+
@content << '</blockquote>'
|
132
|
+
when 'ul', 'ol', 'dl'
|
133
|
+
@content[-1] << "</#{name}>"
|
134
|
+
when 'li', 'dt', 'dd'
|
135
|
+
@content[-1] << " </#{name}>"
|
136
|
+
when 'strong', 'em'
|
137
|
+
@content[-1] << "</#{name}>"
|
138
|
+
when *BLOCK_TAGS
|
139
|
+
@content[-1] << "</p>"
|
140
|
+
when 'pre'
|
141
|
+
@content[-1] << "</pre>"
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
# Stream callback for character data: appends the text to the chunk that is
# currently being built.
#
# NOTE(review): \a in the pattern below matches the BEL control character,
# so ordinary whitespace-only text is NOT skipped. \A (start of string) was
# probably intended - confirm before changing, because skipping such text
# would also drop the spaces between adjacent inline tags.
def text(text)
  unless text =~ /\a\s*\Z/
    # probably slow, but ok for now
    @content.last << text
  end
end
|
151
|
+
|
152
|
+
def start_of_block?
|
153
|
+
BLOCK_TAGS.include? @nested_tags[-1]
|
154
|
+
end
|
155
|
+
|
156
|
+
def path
|
157
|
+
@nested_tags.join('/')
|
158
|
+
end
|
159
|
+
end
|
160
|
+
|
161
|
+
# Counts the whitespace-separated words in an HTML string, ignoring markup.
#
# string - HTML text.
#
# Returns the Integer number of words left once tags are removed.
def word_count(string)
  # %r{} builds a Regexp; a bare %{} is a plain String, which gsub would
  # match literally and therefore never strip any tag.
  # split with no argument also discards leading whitespace, so text that
  # begins with a blank does not count an extra empty word.
  string.gsub(%r{</?[^>]+>}, '').split.size
end
|
164
|
+
end
|
165
|
+
|
data/lib/feedbase.rb
ADDED
metadata
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: feedbase
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.0.1
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Daniel Choi
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-07-31 00:00:00 -04:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: sequel
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ">="
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: "0"
|
25
|
+
type: :runtime
|
26
|
+
version_requirements: *id001
|
27
|
+
description: ""
|
28
|
+
email:
|
29
|
+
- dhchoi@gmail.com
|
30
|
+
executables: []
|
31
|
+
|
32
|
+
extensions: []
|
33
|
+
|
34
|
+
extra_rdoc_files: []
|
35
|
+
|
36
|
+
files:
|
37
|
+
- .gitignore
|
38
|
+
- NOTES
|
39
|
+
- README.markdown
|
40
|
+
- Rakefile
|
41
|
+
- db/create.sql
|
42
|
+
- db/setup.sh
|
43
|
+
- feedbase.gemspec
|
44
|
+
- lib/feedbase.rb
|
45
|
+
- lib/feedbase/feed.rb
|
46
|
+
- lib/feedbase/feed_parser.rb
|
47
|
+
- lib/feedbase/fetch_feed.rb
|
48
|
+
- lib/feedbase/html_simplifier.rb
|
49
|
+
has_rdoc: true
|
50
|
+
homepage: ""
|
51
|
+
licenses: []
|
52
|
+
|
53
|
+
post_install_message:
|
54
|
+
rdoc_options: []
|
55
|
+
|
56
|
+
require_paths:
|
57
|
+
- lib
|
58
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
59
|
+
none: false
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: 1.9.0
|
64
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: "0"
|
70
|
+
requirements: []
|
71
|
+
|
72
|
+
rubyforge_project: feedbase
|
73
|
+
rubygems_version: 1.6.1
|
74
|
+
signing_key:
|
75
|
+
specification_version: 3
|
76
|
+
summary: A feed aggregator database
|
77
|
+
test_files: []
|
78
|
+
|