feedbase 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/NOTES +65 -0
- data/README.markdown +22 -0
- data/Rakefile +23 -0
- data/db/create.sql +40 -0
- data/db/setup.sh +3 -0
- data/feedbase.gemspec +24 -0
- data/lib/feedbase/feed.rb +92 -0
- data/lib/feedbase/feed_parser.rb +153 -0
- data/lib/feedbase/fetch_feed.rb +81 -0
- data/lib/feedbase/html_simplifier.rb +165 -0
- data/lib/feedbase.rb +7 -0
- metadata +78 -0
data/.gitignore
ADDED
data/NOTES
ADDED
@@ -0,0 +1,65 @@
------------------------------------------------------------------------
Fri Jun 24 08:57:18 EDT 2011

This will be the feed aggregator engine for Kindlefeeder, but also
usable for other projects.

PostgreSQL

db/create.sql

Create a database called 'feeds'.

? rchardet https://rubygems.org/gems/edouard-rchardet

API

FetchFeed.from_url [feed-url]

takes care of getting the feed, parsing, and storing in postgres.

Next step is to rebuild the Rails app around this.

Users
Subscriptions
Schedule

What's the most conservative way of doing this?

Keep MySQL and the schema.

Payload generator is a discrete part. Refactor this, and keep the middle.

bin/payload_generator2 is not used any more. Feeds are fetched in
lib/feed_couch.rb #refresh method. It doesn't seem this refresh method
can handle https.

RelaxedPayloadGenerator is the key class used by Delivery.

Payload Specification can be turned into a TABLE and Model. Or the
fields can be added to delivery as discrete fields (instead of a blob).

TODO:
- Check ETag when updating feed.
- private password feeds as user requested
- NEW MOBI generation; periodical format
- migrate feeds from MySQL to PostgreSQL
- check new feed creation
- clean up rails; move to rails 3; put into git; simplify deployment


------------------------------------------------------------------------
Sat Jun 25 23:40:52 EDT 2011

postgresql functions
curval()
setval()

I can perhaps sync the feed_id of this table with feeds.id in the rails
database?
data/README.markdown
ADDED
@@ -0,0 +1,22 @@
# Feedbase

Better instructions forthcoming.

1. Create a PostgreSQL database called `feeds`.
2. Load the database script `db/create.sql`.

## API

    Feedbase::Feed[feed_url: feed_url] || Feedbase::Feed.create(feed_url: feed_url)

## Instance methods and attributes

    Feedbase::Feed#refresh

    Feedbase::Feed#title
    Feedbase::Feed#feed_url
    Feedbase::Feed#items
data/Rakefile
ADDED
@@ -0,0 +1,23 @@
require 'rake'
require 'rake/testtask'
require 'bundler'
Bundler::GemHelper.install_tasks

# Make lib/ requirable from rake tasks.
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), 'lib')

desc "Run tests"
task :test do
  # Put test/ on the load path, load every *_test.rb file, then let
  # MiniTest run whatever was loaded.
  $:.unshift File.expand_path("test")
  require 'test_helper'
  Dir.chdir("test") do
    Dir['*_test.rb'].each do |test_file|
      puts "requiring #{test_file}"
      require test_file
    end
  end

  MiniTest::Unit.autorun
end

task :default => :test
data/db/create.sql
ADDED
@@ -0,0 +1,40 @@
-- Schema for the Feedbase PostgreSQL database ("feeds").
-- Drop in dependency order; CASCADE removes dependent objects.
drop table if exists feeds CASCADE;
drop table if exists feed_downloads CASCADE;
drop table if exists items CASCADE;

-- One row per subscribed feed.
create table feeds (
  feed_id serial primary key,
  feed_url varchar UNIQUE NOT NULL,   -- canonical feed URL
  title varchar,
  alpha_title varchar,                -- title with leading article stripped, for sorting
  subtitle varchar,
  web_url varchar,                    -- the feed's human-facing website
  favicon_url varchar,
  subscribers integer default 0,
  created timestamp default now()
);

-- One row per fetch of a feed (HTTP caching metadata).
create table feed_downloads (
  feed_download_id serial primary key,
  feed_id integer REFERENCES feeds (feed_id) ON DELETE CASCADE,
  download_time float,                -- seconds elapsed for the fetch
  headers text,                       -- raw response headers
  encoding varchar,
  etag varchar,
  last_modified timestamp,
  created timestamp default now()
);

-- Individual feed entries.
create table items (
  item_id serial primary key,
  feed_id integer REFERENCES feeds (feed_id) ON DELETE CASCADE,
  guid varchar UNIQUE NOT NULL,       -- unique across all feeds, used for dedup
  title varchar,
  link varchar,
  content text,
  author varchar,
  word_count integer,
  pub_date timestamp default now()
);
data/db/setup.sh
ADDED
data/feedbase.gemspec
ADDED
@@ -0,0 +1,24 @@
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)

Gem::Specification.new do |s|
  s.name     = "feedbase"
  s.version  = '0.0.1'
  s.platform = Gem::Platform::RUBY
  s.required_ruby_version = '>= 1.9.0'

  s.authors     = ["Daniel Choi"]
  s.email       = ["dhchoi@gmail.com"]
  s.homepage    = ""
  s.summary     = %q{A feed aggregator database}
  s.description = %q{}

  s.rubyforge_project = "feedbase"

  # Package whatever git tracks.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency 'sequel'
end
|
@@ -0,0 +1,92 @@
require 'sequel'
require 'feedbase/fetch_feed'
DB = Sequel.connect 'postgres:///feeds'

module Feedbase

  # Raised when refreshing a feed discovers it now redirects to a URL
  # that already belongs to a different Feed record.
  class Redirected < StandardError; end

  class Feed < Sequel::Model
    one_to_many :items
    one_to_many :feed_downloads

    # Fetches the feed, updates this row, records the download, and
    # inserts any items whose guid is not already stored.
    #
    # force - when true, bypass the ETag cache check and always fetch.
    #
    # Returns the array of newly created Feedbase::Item records (the
    # caller can extract an item count from it), or nil on an ETag
    # cache hit.
    def refresh(force=false)
      raise "Missing feed_url" if feed_url.nil?
      ff = Feedbase::FetchFeed.new(feed_url)
      headers = ff.headers
      if !force
        # Skip the full download when the server reports the same ETag
        # as the most recent recorded download.
        if last_etag && (headers[:etag] == last_etag)
          puts "-- #{feed_url} -- ETag cache hit"
          return
        end
      end
      data = ff.fetch
      params = data[:feed_params].merge(:alpha_title => make_alpha_title(data[:feed_params][:title]))
      if params[:feed_url] != self[:feed_url]
        # The fetch followed a redirect; refuse to silently collide with
        # a feed record that already owns the destination URL.
        if x = self.class.filter(:feed_url => params[:feed_url]).first
          raise Redirected.new("Redirected to existing feed: #{x.feed_url}")
        end
      end
      params.delete(:feed_url)
      # BUGFIX: the original read `begin Sequel::DatabaseError`, which
      # just evaluated (and discarded) the constant; the intent was a
      # plain begin/rescue around the update.
      begin
        update params
      rescue StandardError # e.g. PGError / Sequel::DatabaseError
        puts "The offending record is #{self.inspect}"
        raise
      end

      Feedbase::FeedDownload.create({feed_id: feed_id}.merge(data[:download_params]))
      items_created = data[:items].
        select {|item| Feedbase::Item[:guid => item[:guid]].nil?}.
        map { |item|
          params = {
            feed_id: feed_id,
            title: item[:title].encode("utf-8"),
            guid: item[:guid],
            link: item[:link],
            content: item[:content],
            author: item[:author],
            word_count: item[:word_count],
            pub_date: item[:pub_date]
          }
          Feedbase::Item.create params
        }
      # caller can extract an item count from this
      items_created
    end

    # The most recent download record for this feed (memoized).
    # BUGFIX: order explicitly — without an ORDER BY the database is
    # free to return any matching row, so "last" was arbitrary.
    def last_download
      @last_download ||= FeedDownload.filter(feed_id: feed_id).
        order(Sequel.desc(:feed_download_id)).first
    end

    # ETag of the most recent download, or nil if never downloaded.
    def last_etag
      last_download && last_download.etag
    end

    # Last-Modified timestamp of the most recent download, or nil.
    def last_modified
      last_download && last_download.last_modified
    end

    # Strips a leading English article so titles sort alphabetically.
    def make_alpha_title(s)
      return if s.nil?
      s.gsub(/^(The|A|An)\s/, '')
    end
  end

  class Item < Sequel::Model
    many_to_one :feed
  end

  class FeedDownload < Sequel::Model
  end

end

if __FILE__ == $0
  feed = Feedbase::Feed[feed_url: ARGV.first] || Feedbase::Feed.create(feed_url: ARGV.first)
  puts feed
  puts feed.refresh
end
@@ -0,0 +1,153 @@
require 'rexml/streamlistener'
require 'rexml/document'
require 'pp'
require 'iconv'
require 'feedbase/html_simplifier'
require 'date'

module Feedbase

  # Stream-parses RSS/Atom/RDF XML into a plain hash.
  class FeedParser
    # Try to have the XML in UTF-8 when you call this.
    def initialize(xml)
      @xml = xml
      @listener = FeedListener.new
      REXML::Document.parse_stream(@xml, @listener)
    end

    # Returns the parsed feed hash:
    # {:title, :link, :orig_encoding, :items => [{:title, :guid, :link,
    #  :content, :author, :pub_date, :word_count}, ...]}
    def result
      tidy(@listener.result)
    end

    # Normalizes each item: simplifies the HTML body, folds :summary
    # into :content, and computes :word_count.
    def tidy(feed)
      feed[:items] = feed[:items].map do |item|
        body = item[:content] || item[:summary] || ""
        new_body = HtmlSimplifier.new(body, "utf-8").result.
          gsub(%r{<p>(\n|<br/>)+</p>}, '').
          strip + "\n\n"
        item.delete(:summary)
        item[:content] = new_body
        item[:word_count] = word_count(new_body)
        item
      end
      feed
    end

    # Counts words after stripping markup.
    # BUGFIX: must be %r{...} (a Regexp) — the original %{...} built a
    # String, which gsub matches literally, so tags were never stripped
    # and were counted as words.
    def word_count(string)
      string.gsub(%r{</?[^>]+>}, '').split(/\s+/).size
    end
  end

  # REXML stream listener that recognizes Atom (feed/...), RSS 2.0
  # (rss/channel/...) and RDF/RSS 1.0 (rdf:RDF/...) element paths.
  class FeedListener
    include REXML::StreamListener

    FEED_TITLE_TAGS = %w[ feed/title rss/channel/title rdf:RDF/channel/title ]

    FEED_LINK_TAGS = %w[ rss/channel/link rdf:RDF/channel/link ]

    ITEM_START_TAGS = %w[ feed/entry rss/channel/item rdf:RDF/item ]

    ITEM_TITLE_TAGS = %w[ feed/entry/title rss/channel/item/title rdf:RDF/item/title ]

    ITEM_AUTHOR_TAGS = %w[ feed/entry/author/name rss/channel/item/author rdf:RDF/item/dc:creator ]

    ITEM_GUID_TAGS = %w[ feed/entry/id rss/channel/item/guid rdf:RDF/item/guid rdf:RDF/item/feedburner:origLink ]

    ITEM_PUB_DATE_TAGS = %w[ feed/entry/published feed/entry/created feed/entry/modified rss/channel/item/pubDate rdf:RDF/item/dc:date ]

    ITEM_LINK_TAGS = %w[ rss/channel/item/link rdf:RDF/item/link ]

    ITEM_SUMMARY_TAGS = %w[ feed/entry/summary rss/channel/item/description rdf:RDF/item/description ]
    ITEM_CONTENT_TAGS = [ %r{feed/entry/content}, %r{rss/channel/item/content}, %r{rss/channel/item/content:encoded}, %r{rss/item/content}, %r{rdf:RDF/item/content} ]

    def initialize
      @nested_tags = []           # stack of open element names
      @x = {:items => []}         # accumulated result
    end

    def result; @x; end

    def tag_start(name, attrs)
      @nested_tags.push name
      case path
      when 'feed/link'
        # Atom carries the link in an href attribute, not element text.
        @x[:link] = encode attrs['href']
      when *ITEM_START_TAGS
        @current_item = {}
      when 'feed/entry/link'
        @current_item[:link] = encode attrs['href']
      end
    end

    def tag_end(name)
      case path
      when *ITEM_START_TAGS
        @x[:items] << @current_item
        @current_item = nil
      end
      @nested_tags.pop
    end

    def text(text)
      case path
      when *FEED_TITLE_TAGS
        @x[:title] = encode text.strip
      when *FEED_LINK_TAGS
        @x[:link] = encode text.strip
      when *ITEM_TITLE_TAGS
        @current_item[:title] = encode(text.strip)
      when *ITEM_AUTHOR_TAGS
        @current_item[:author] = encode(text.strip)
      when *ITEM_GUID_TAGS
        @current_item[:guid] = encode(text)
      when *ITEM_PUB_DATE_TAGS
        @current_item[:pub_date] = DateTime.parse(encode(text))
      when *ITEM_LINK_TAGS
        @current_item[:link] = encode(text)
      when *ITEM_SUMMARY_TAGS
        # REXML may deliver text in several chunks; append.
        if @current_item[:summary]
          @current_item[:summary] << encode(text)
        else
          @current_item[:summary] = encode(text)
        end
      when *ITEM_CONTENT_TAGS
        if @current_item[:content]
          @current_item[:content] << encode(text)
        else
          @current_item[:content] = encode(text)
        end
      end
    end
    alias_method :cdata, :text

    # Records the declared encoding; defaults to UTF-8 when the XML
    # declaration names none.
    def xmldecl(decl, encoding, extra)
      if encoding
        @x[:orig_encoding] = encoding.downcase
      else
        @x[:orig_encoding] = "UTF-8"
      end
    end

    # Slash-joined path of currently open elements.
    def path
      @nested_tags.join('/')
    end

    # encoding hook — currently a pass-through.
    def encode(string)
      string
    end

  end
end

if __FILE__ == $0
  feeds = ARGV
  feeds.each do |feed|
    xml = File.read feed
    f = Feedbase::FeedParser.new(xml)
    pp f.result
  end
end
@@ -0,0 +1,81 @@
require 'feedbase/feed_parser'
require 'timeout'
require 'iconv'
require 'shellwords'

module Feedbase
  # Downloads a feed over HTTP (shelling out to curl), transcodes it to
  # UTF-8, and parses it with Feedbase::FeedParser.
  class FetchFeed

    attr_accessor :feed_url

    def initialize(feed_url)
      @feed_url = feed_url
    end

    # Fetches (and memoizes) the response headers for feed_url.
    # Returns {headers:, encoding:, etag:, last_modified:}.
    # Raises RuntimeError when no 200 OK response is seen.
    def headers
      if @headers
        return @headers
      end
      _headers = begin
        Timeout::timeout(20) do
          agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.3) Gecko/2008092416 Firefox/3.0.3"
          # HEAD request, following redirects; normalize CRLF.
          # SECURITY FIX: Shellwords.escape prevents shell injection via
          # a crafted feed_url (it was previously interpolated raw).
          res = `curl -sIL -A#{Shellwords.escape(agent)} #{Shellwords.escape(feed_url)}`.gsub("\r\n", "\n")
          if res !~ /^HTTP.*200 OK$/
            puts res.inspect
            raise "Response not OK"
          end
          res
        end
      end

      #TODO check for xml
      @headers = { headers: _headers,
                   encoding: _headers[/^Content-Type:.*charset=(.*)$/i, 1],
                   etag: _headers[/^ETag: (.*)$/,1],
                   last_modified: ((x = _headers[/Last-Modified: (.*)/, 1]) && DateTime.parse(x)) }
    end

    # Downloads and parses the feed.
    # Returns {feed_params:, items:, download_params:}.
    def fetch
      url = fix_url(feed_url)
      start_time = Time.now
      result = Timeout::timeout(20) do
        agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.3) Gecko/2008092416 Firefox/3.0.3"
        headers # make sure headers are fetched/memoized first
        # get the body, following any redirects (escaped as above)
        `curl -sL -A#{Shellwords.escape(agent)} #{Shellwords.escape(url)}`
      end
      elapsed = Time.now - start_time
      if !(x = headers[:headers].scan(/^Location: (.*)$/).flatten).empty?
        # BUGFIX: use the writer — the original assigned a local
        # variable named feed_url, so the object's accessor kept the
        # stale, pre-redirect URL.
        self.feed_url = x.last
      end
      result2 = Iconv.conv("UTF-8//TRANSLIT//IGNORE", (headers[:encoding] || 'iso-8859-1'), result)
      f = FeedParser.new(result2).result
      feed_params = {:feed_url => feed_url, :title => f[:title], :web_url => f[:link]}

      { feed_params: feed_params,
        items: f[:items],
        download_params: headers.merge(download_time: elapsed) }
    end

    # Prepends "http://" when the URL lacks an http(s) scheme.
    def fix_url(url)
      unless url =~ /^https?:\/\//
        url = "http://" + url
      end
      url
    end
  end

end


if __FILE__ == $0
  puts Feedbase::FetchFeed.new(ARGV.first).fetch
end
@@ -0,0 +1,165 @@
#!/usr/bin/env ruby19
$:.unshift(File.dirname(__FILE__) + "/../lib")

# Takes output of feed_file_generator.rb encoded in INPUT_ENCODING as input and
# strips superfluous markup from the feed item bodies.

#require 'feed_file_generator'
require 'fileutils'
require 'rexml/streamlistener'
require 'rexml/document'
require 'open3'

# NOTE requires the htmltidy program
# http://tidy.sourceforge.net/docs/Overview.html

INPUT_ENCODING = 'utf-8'

module Feedbase
  # Normalizes an HTML fragment: runs it through the external `tidy`
  # program to get well-formed XHTML, then rebuilds a minimal-markup
  # version (paragraphs, links, lists, emphasis) via FeedHtmlListener.
  class HtmlSimplifier
    include FileUtils::Verbose
    attr :result

    # html          - the HTML fragment to simplify
    # orig_encoding - encoding label of the input (not currently used by tidy)
    def initialize(html, orig_encoding)
      @orig_encoding = orig_encoding
      @xml = tidy(pre_cleanup(html))
      # Strip leftover "<http://...>" autolink artifacts.
      @result = parse.gsub(/<http[^>]+>/, "")
    end

    def parse
      @listener = FeedHtmlListener.new
      REXML::Document.parse_stream(@xml, @listener)
      @listener.result + "\n\n"
    end

    # Removes empty MS-Office namespace tags.
    # BUGFIX: use gsub (not gsub!) so the caller's string is not
    # mutated as a side effect.
    def pre_cleanup(html)
      html.gsub("<o:p></o:p>", "")
    end

    # Pipes the fragment, wrapped in a minimal XHTML document, through
    # the `tidy` executable and returns its XML output.
    def self.tidy(html, orig_encoding)
      # assumes input encoding of latin 1
      #output = Open3.popen3("tidy -q -n -wrap 120 -asxml -latin1") do |stdin, stdout, stderr|
      #output = IO.popen("tidy -q -n -wrap 120 -asxml -latin1", "r+") do |pipe|
      #output = IO.popen("tidy -q -wrap 120 -raw -asxml ", "r+") do |pipe| # if from latin1

      tidy = "tidy -q -wrap 120 -n -utf8 -asxml 2>/dev/null"
      output = IO.popen(tidy, "r+") do |pipe|
        input = <<-END
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title></title></head><body>#{html}</body></html>
        END
        pipe.puts input
        pipe.close_write
        #$stderr.puts stderr.read
        pipe.read
      end
      output
    end

    def tidy(html)
      self.class.tidy html, @orig_encoding
    end
  end


  # Stream listener that rewrites tidy's XHTML into a flat list of
  # simplified block strings (joined by blank lines in #result).
  class FeedHtmlListener
    include REXML::StreamListener

    STRIP_TAGS = %w[ body font ]
    BLOCK_TAGS = %w[ p div ]
    HEADER_TAGS = %w[ h1 h2 h3 h4 h5 h6 ]

    # All header levels are flattened to this tag.
    UNIFORM_HEADER_TAG = "h4"

    def initialize
      @nested_tags = []   # stack of open element names
      @content = [""]     # one string per output block
    end

    def result
      # we call strip_empty_tags twice to catch empty tags nested in a tag like <p>
      # not full-proof but good enough for now
      x = @content.map {|line| strip_empty_tags( strip_empty_tags( line ).strip ) }.
        select {|line| line != ""}.compact.join("\n\n")
    end

    # Deletes tag pairs with only whitespace between them, e.g. "<p> </p>".
    def strip_empty_tags(line)
      line.gsub(%r{<(\w+)[^>]*>\s*</\1>}, '')
    end

    def tag_start(name, attrs)
      @nested_tags.push name
      case name
      when 'a'
        # effectively strips out all style tags
        @content[-1] << "<a href='#{attrs['href']}'>"
      when 'img'
        # Images are replaced with a textual placeholder.
        if attrs['alt']
          text = (attrs['alt'].strip == '') ? 'image ' : "image:#{attrs['alt']} "
          @content[-1] << text
        end
      when *HEADER_TAGS
        @content << "<#{UNIFORM_HEADER_TAG}>"
      when 'br' #skip
        @content << "<br/>"
      when 'blockquote'
        @content << "<blockquote>"
      when 'ul', 'ol', 'dl'
        @content << "<#{name}>"
      when 'li', 'dt', 'dd'
        @content[-1] << " <#{name}>"
      when 'strong', 'em'
        @content[-1] << "<#{name}>"
      when *BLOCK_TAGS
        @content << "<p>"
      when 'pre'
        @content << "<pre>"
      end
    end

    def tag_end(name)
      @nested_tags.pop
      case name
      when 'a'
        @content[-1] << "</a>"
      when *HEADER_TAGS
        @content[-1] << "</#{UNIFORM_HEADER_TAG}>"
      when 'blockquote'
        @content << '</blockquote>'
      when 'ul', 'ol', 'dl'
        @content[-1] << "</#{name}>"
      when 'li', 'dt', 'dd'
        @content[-1] << " </#{name}>"
      when 'strong', 'em'
        @content[-1] << "</#{name}>"
      when *BLOCK_TAGS
        @content[-1] << "</p>"
      when 'pre'
        @content[-1] << "</pre>"
      end
    end

    def text(text)
      # BUGFIX: \A (string start), not \a (bell character) — the intent
      # is to skip whitespace-only text nodes, which the original regex
      # never matched.
      return if text =~ /\A\s*\Z/

      # probably slow, but ok for now
      @content[-1] << text
    end

    def start_of_block?
      BLOCK_TAGS.include? @nested_tags[-1]
    end

    # Slash-joined path of currently open elements.
    def path
      @nested_tags.join('/')
    end
  end

  # Counts words after stripping markup.
  # BUGFIX: %r{...} (a Regexp), not %{...} (a String) — gsub with a
  # string pattern matches it literally, so tags were never stripped.
  def word_count(string)
    string.gsub(%r{</?[^>]+>}, '').split(/\s+/).size
  end
end
data/lib/feedbase.rb
ADDED
metadata
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: feedbase
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.0.1
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Daniel Choi
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-07-31 00:00:00 -04:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: sequel
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ">="
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: "0"
|
25
|
+
type: :runtime
|
26
|
+
version_requirements: *id001
|
27
|
+
description: ""
|
28
|
+
email:
|
29
|
+
- dhchoi@gmail.com
|
30
|
+
executables: []
|
31
|
+
|
32
|
+
extensions: []
|
33
|
+
|
34
|
+
extra_rdoc_files: []
|
35
|
+
|
36
|
+
files:
|
37
|
+
- .gitignore
|
38
|
+
- NOTES
|
39
|
+
- README.markdown
|
40
|
+
- Rakefile
|
41
|
+
- db/create.sql
|
42
|
+
- db/setup.sh
|
43
|
+
- feedbase.gemspec
|
44
|
+
- lib/feedbase.rb
|
45
|
+
- lib/feedbase/feed.rb
|
46
|
+
- lib/feedbase/feed_parser.rb
|
47
|
+
- lib/feedbase/fetch_feed.rb
|
48
|
+
- lib/feedbase/html_simplifier.rb
|
49
|
+
has_rdoc: true
|
50
|
+
homepage: ""
|
51
|
+
licenses: []
|
52
|
+
|
53
|
+
post_install_message:
|
54
|
+
rdoc_options: []
|
55
|
+
|
56
|
+
require_paths:
|
57
|
+
- lib
|
58
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
59
|
+
none: false
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: 1.9.0
|
64
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: "0"
|
70
|
+
requirements: []
|
71
|
+
|
72
|
+
rubyforge_project: feedbase
|
73
|
+
rubygems_version: 1.6.1
|
74
|
+
signing_key:
|
75
|
+
specification_version: 3
|
76
|
+
summary: A feed aggregator database
|
77
|
+
test_files: []
|
78
|
+
|