feed_yamlizer 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +8 -0
- data/MIT-LICENSE.txt +21 -0
- data/README.markdown +7 -0
- data/bin/feed2yaml +31 -0
- data/feed_yamlizer.gemspec +24 -0
- data/lib/feed_yamlizer.rb +108 -0
- data/lib/feed_yamlizer/feed_listener.rb +102 -0
- data/lib/feed_yamlizer/feed_parser.rb +28 -0
- data/lib/feed_yamlizer/html_cleaner.rb +87 -0
- data/lib/feed_yamlizer/html_listener.rb +125 -0
- data/lib/feed_yamlizer/textifier.rb +16 -0
- data/lib/feed_yamlizer/version.rb +3 -0
- metadata +114 -0
data/.gitignore
ADDED
data/MIT-LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
Copyright (c) 2011 Daniel Choi, http://danielchoi.com/software/
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
21
|
+
|
data/README.markdown
ADDED
data/bin/feed2yaml
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line front end for FeedYamlizer: prints a feed as YAML on stdout.
#
# Usage:
#   feed2yaml URL          # fetch the feed from URL
#   feed2yaml < feed.xml   # read raw feed XML from standard input
#
# When the TEST environment variable is set, the items are printed as plain
# text (title followed by the textified body) instead of YAML.

begin
  require 'feed_yamlizer'
rescue LoadError
  require 'rubygems'
  require 'feed_yamlizer'
end
require 'open-uri'
# Required explicitly so that result.to_yaml below does not depend on
# feed_yamlizer loading YAML as a side effect.
require 'yaml'

# for testing
def print_text(res)
  res[:items].each do |item|
    puts '-' * 30
    puts item[:title]
    puts
    puts item[:content][:text]
  end
end

# An explicit URL argument takes precedence; otherwise read XML from a pipe.
# Checking only STDIN.tty? ignored the URL whenever stdin was not a terminal
# (cron, CI) and called process_url(nil) when no argument was given.
url = ARGV.first
result = if url
  FeedYamlizer.process_url url
elsif !STDIN.tty?
  FeedYamlizer.process_xml STDIN.read
else
  abort "Usage: feed2yaml URL   (or pipe feed XML on standard input)"
end

if ENV['TEST']
  print_text result
else
  puts result.to_yaml
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "feed_yamlizer/version"
|
4
|
+
|
5
|
+
# Gem packaging metadata for feed_yamlizer. File lists are taken from git,
# so the gem must be built from inside a git checkout.
Gem::Specification.new do |spec|
  # Identity
  spec.name        = "feed_yamlizer"
  spec.version     = FeedYamlizer::VERSION
  spec.platform    = Gem::Platform::RUBY
  spec.authors     = ["Daniel Choi"]
  spec.email       = ["dhchoi@gmail.com"]
  spec.homepage    = "https://github.com/danchoi/feed_yamlizer"
  spec.summary     = "A feed parser and converter"
  spec.description = "Converts feeds to YAML and converts entries to plain text"

  spec.rubyforge_project = "feed_yamlizer"

  # Packaged files, as tracked by git
  tracked_bins = `git ls-files -- bin/*`.split("\n")
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = tracked_bins.map { |file| File.basename(file) }
  spec.require_paths = ["lib"]

  # Runtime dependencies
  spec.add_dependency 'nokogiri'
  spec.add_dependency 'htmlentities'
  spec.add_dependency 'sqlite3-ruby' # because htmlentities somehow requires this; a bug
end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
# Takes raw feed XML as input and generates a file with YAML and raw feed item
|
2
|
+
# bodies in a uniform "UTF-8".
|
3
|
+
|
4
|
+
# requires Ruby 1.9
|
5
|
+
|
6
|
+
require 'rexml/streamlistener'
|
7
|
+
require 'rexml/document'
|
8
|
+
require 'feed_yamlizer/feed_listener'
|
9
|
+
require 'feed_yamlizer/feed_parser'
|
10
|
+
require 'feed_yamlizer/html_listener'
|
11
|
+
require 'feed_yamlizer/html_cleaner'
|
12
|
+
require 'nokogiri'
|
13
|
+
require 'feed_yamlizer/textifier'
|
14
|
+
require 'fileutils'
|
15
|
+
require 'yaml'
|
16
|
+
require 'htmlentities'
|
17
|
+
|
18
|
+
# Turns the hash produced by FeedYamlizer::FeedParser into the final result
# hash: feed-level metadata under :meta and one hash per entry under :items,
# each entry carrying its cleaned HTML and a plain-text rendering.
#
# The class-level methods (process_xml, process_url, run) are the entry
# points used by bin/feed2yaml.
class FeedYamlizer
  include FileUtils::Verbose

  # feed - the parsed feed hash; read with the keys :title, :link,
  #        :xml_encoding and :items.
  def initialize(feed)
    @feed = feed
    @result = {:meta => {}, :items => []}
  end

  # Builds and returns the result hash {:meta => {...}, :items => [...]}.
  # Safe to call more than once: the item list is rebuilt on each call.
  def result
    add_feed_metaresult
    add_items
    @result
  end

  # Copies the feed-level fields into @result[:meta].
  def add_feed_metaresult
    fields = [:title, :link, :xml_encoding]
    @result[:meta] = fields.reduce({}) {|memo, field|
      memo[field] = @feed[field]
      memo
    }
  end

  # Rebuilds @result[:items] from the parsed feed items.
  def add_items
    # Start from an empty list; appending to the existing one made every
    # additional call to #result duplicate all items.
    @result[:items] = []
    @feed[:items].each_with_index {|item, i|
      add_item_metaresult item, i
      add_raw_content item, i
    }
  end

  # Appends a new entry holding the item's metadata fields.
  # index is accepted for symmetry with add_raw_content and is not used.
  def add_item_metaresult(item, index)
    fields = [:title, :author, :guid, :pub_date, :link]
    metaresult = fields.reduce({}) {|memo, field|
      memo[field] = item[field]
      memo
    }
    @result[:items] << metaresult
  end

  # Attaches :content => {:html, :text} to the most recently added entry.
  # The body is the item's :content, falling back to :summary, then "".
  def add_raw_content(item, index)
    content = (item[:content] || item[:summary] || "").gsub(/^\s*/, '').strip
    @result[:items][-1][:content] = {:html => content}
    # TODO check if HTML or plain text!
    simplified = HtmlCleaner.new(content).output
    textified = Textifier.new(simplified).output
    #@result[:items][-1][:content][:simplified] = simplified
    @result[:items][-1][:content][:text] = textified
  end

  class << self
    # Returns the encoding named in the document's <?xml ... ?> declaration,
    # or nil when there is none. The value is also logged to STDERR.
    def xml_encoding(rawxml)
      # Match on a binary view of the bytes: the document has not been
      # transcoded yet, and regexp matching raises on strings whose bytes are
      # invalid for their tagged encoding. Only the XML declaration is
      # consulted, so an encoding="..." attribute elsewhere in the feed is
      # not mistaken for the document encoding.
      bytes = rawxml.dup.force_encoding('BINARY')
      encoding = bytes[/<\?xml\b[^>]*?\bencoding\s*=\s*["']([^"']+)["']/, 1]
      STDERR.puts "xml encoding: #{encoding.inspect}"
      encoding
    end

    # Converts x from the given encoding to UTF-8 with Iconv, transliterating
    # or dropping what cannot be converted.
    def to_utf(x, encoding = 'ISO-8859-1')
      Iconv.conv("UTF-8//TRANSLIT//IGNORE", encoding, x)
    end

    # Aborts the program unless a `tidy` executable is found on the PATH.
    def check_for_tidy
      if `which tidy` == ''
        abort "Please install tidy"
      end
    end

    # main method
    #
    # feed_xml - raw feed document
    # encoding - name of the encoding feed_xml is in
    #
    # Returns the result hash (see #result).
    def run(feed_xml, encoding)
      check_for_tidy
      feed_xml = to_utf feed_xml, encoding
      parsed_data = FeedYamlizer::FeedParser.new(feed_xml).result
      FeedYamlizer.new(parsed_data).result
    end

    # Processes a raw feed document held in a string.
    def process_xml(xml)
      # A document without an encoding declaration is UTF-8 per the XML
      # specification; passing nil through made Iconv raise.
      run xml, xml_encoding(xml) || "UTF-8"
    end

    # Fetches the feed at url and processes it. The encoding is taken from
    # the response charset, then the XML declaration, then ISO-8859-1.
    def process_url(url)
      # Loaded here because the library itself never required open-uri; only
      # bin/feed2yaml did.
      require 'open-uri'
      uri = URI.parse(url)
      # Kernel#open on a caller-supplied string would also open local files
      # and run "|command" pipes, so only URIs open-uri can fetch are allowed.
      unless uri.respond_to?(:open)
        raise ArgumentError, "not a fetchable URL: #{url.inspect}"
      end
      charset = nil
      xml = uri.open do |response|
        charset = response.charset
        #STDERR.puts "charset: #{charset}"
        response.read
      end
      encoding = charset || xml_encoding(xml) || "ISO-8859-1"
      run xml, encoding
    end
  end
end
|
107
|
+
|
108
|
+
|
@@ -0,0 +1,102 @@
|
|
1
|
+
# DateTime is used for publication dates below.
require 'date'

class FeedYamlizer
  # REXML stream listener that collects feed and item fields from Atom, RSS
  # and RDF documents. Elements are recognised by their slash-joined path from
  # the document root (e.g. "rss/channel/item/title").
  #
  # #result returns a hash with :title, :link, :xml_encoding (when seen) and
  # :items, an array of hashes with :title, :author, :guid, :pub_date, :link,
  # :summary and :content (each only when present in the feed).
  class FeedListener
    include REXML::StreamListener

    FEED_TITLE_TAGS = %w[ feed/title rss/channel/title rdf:RDF/channel/title ]

    FEED_LINK_TAGS = %w[ rss/channel/link rdf:RDF/channel/link ]

    ITEM_START_TAGS = %w[ feed/entry rss/channel/item rdf:RDF/item ]

    ITEM_TITLE_TAGS = %w[ feed/entry/title rss/channel/item/title rdf:RDF/item/title ]

    ITEM_AUTHOR_TAGS = %w[ feed/entry/author/name rss/channel/item/author rdf:RDF/item/dc:creator ]

    ITEM_GUID_TAGS = %w[ feed/entry/id rss/channel/item/guid rdf:RDF/item/guid ]

    ITEM_PUB_DATE_TAGS = %w[ feed/entry/published feed/entry/created feed/entry/modified rss/channel/item/pubDate rdf:RDF/item/dc:date ]

    ITEM_LINK_TAGS = %w[ rss/channel/item/link rdf:RDF/item/link ]

    ITEM_SUMMARY_TAGS = %w[ feed/entry/summary rss/channel/item/description rdf:RDF/item/description ]
    # Regexps (unanchored), so text nested below a content element matches too.
    ITEM_CONTENT_TAGS = [ %r{feed/entry/content}, %r{rss/channel/item/content}, %r{rss/channel/item/content:encoded}, %r{rss/item/content}, %r{rdf:RDF/item/content} ]

    def initialize
      @nested_tags = []
      @x = {:items => []}
    end

    # The data collected so far.
    def result; @x; end

    # Stream callback: an element was opened.
    def tag_start(name, attrs)
      @nested_tags.push name
      case path
      when 'feed/link'
        set_atom_link @x, attrs
      when *ITEM_START_TAGS
        @current_item = {}
      when 'feed/entry/link'
        set_atom_link @current_item, attrs
      end
    end

    # Stream callback: an element was closed. Closing an item element moves
    # the current item into the result.
    def tag_end(name)
      case path
      when *ITEM_START_TAGS
        @x[:items] << @current_item
        @current_item = nil
      end
      @nested_tags.pop
    end

    # Stream callback: character data inside the current element. Also
    # aliased as #cdata, so it can fire several times for one element.
    def text(text)
      case path
      when *FEED_TITLE_TAGS
        set_scalar @x, :title, text
      when *FEED_LINK_TAGS
        set_scalar @x, :link, text
      when *ITEM_TITLE_TAGS
        set_scalar @current_item, :title, text
      when *ITEM_AUTHOR_TAGS
        set_scalar @current_item, :author, text
      when *ITEM_GUID_TAGS
        set_scalar @current_item, :guid, text
      when *ITEM_PUB_DATE_TAGS
        set_pub_date text
      when *ITEM_LINK_TAGS
        set_scalar @current_item, :link, text
      when *ITEM_SUMMARY_TAGS
        append_text :summary, text
      when *ITEM_CONTENT_TAGS
        append_text :content, text
      end
    end
    alias_method :cdata, :text

    # Stream callback: the XML declaration. Records the declared encoding
    # (downcased), or "UTF-8" when the declaration names none.
    def xmldecl(decl, encoding, extra)
      if encoding
        @x[:xml_encoding] = encoding.downcase
      else
        @x[:xml_encoding] = "UTF-8"
      end
    end

    # Slash-joined names of the currently open elements.
    def path
      @nested_tags.join('/')
    end

    # encoding method
    # TODO
    def encode(string)
      string
    end

    private

    # Stores the stripped text under key. Blank chunks are ignored: the
    # whitespace around a CDATA section arrives as separate text events and
    # would otherwise overwrite the real value with "".
    def set_scalar(target, key, text)
      return if target.nil?
      value = encode(text).strip
      target[key] = value unless value.empty?
    end

    # Parses the publication date. A blank or unparsable date leaves
    # :pub_date unset instead of raising and aborting the whole feed.
    def set_pub_date(text)
      return if @current_item.nil?
      value = encode(text).strip
      return if value.empty?
      begin
        @current_item[:pub_date] = DateTime.parse(value)
      rescue ArgumentError
        # malformed date; keep whatever was recorded before (usually nothing)
      end
    end

    # Accumulates summary/content text, which may arrive in several chunks.
    def append_text(key, text)
      # content patterns are unanchored and can match outside a known item
      return if @current_item.nil?
      chunk = encode(text)
      if @current_item[key]
        @current_item[key] << chunk
      else
        # dup so later appends do not mutate the parser's string
        @current_item[key] = chunk.dup
      end
    end

    # Records the href of an Atom <link>. A link with no rel, or with
    # rel="alternate", always wins; any other rel (self, replies, edit, ...)
    # is used only while no link has been recorded yet.
    def set_atom_link(target, attrs)
      href = attrs['href']
      return if target.nil? || href.nil?
      rel = attrs['rel']
      alternate = rel.nil? || rel == 'alternate'
      target[:link] = encode(href) if alternate || target[:link].nil?
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# Custom feed parsing code by Daniel Choi dhchoi@gmail.com
|
2
|
+
# The goal is minimal dependencies (Feedzirra, for example, pulls in too many specialized dependencies).
|
3
|
+
|
4
|
+
# TODO
|
5
|
+
# come up with an encoding handling strategy
|
6
|
+
|
7
|
+
require 'iconv'
|
8
|
+
require 'yaml'
|
9
|
+
|
10
|
+
class FeedYamlizer
  # Parses feed XML by streaming it through a FeedListener and exposes the
  # listener's collected data via #result.
  class FeedParser
    # xml      - the feed document as a string
    # encoding - accepted for callers that pass it; currently unused
    #
    # If REXML rejects the document, it is transliterated to US-ASCII
    # (treating the bytes as ISO-8859-1) and parsed once more. A second
    # failure propagates to the caller.
    def initialize(xml, encoding=nil)
      @xml = xml
      @listener = FeedListener.new
      REXML::Document.parse_stream(@xml, @listener)
      # TODO this is a hack, do it right
    rescue REXML::ParseException
      #puts "REXML::ParseException; converting xml to ascii"
      @xml = Iconv.conv("US-ASCII//TRANSLIT//IGNORE", "ISO-8859-1", @xml)
      # Use a fresh listener: the first one already holds whatever was parsed
      # before the error, and reusing it would duplicate those items.
      @listener = FeedListener.new
      REXML::Document.parse_stream(@xml, @listener)
    end

    # The hash collected by the listener.
    def result
      @listener.result
    end
  end
end
|
28
|
+
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# Takes output of feed_file_generator.rb encoded in UTF-8 as input and
|
2
|
+
# strips superfluous markup from the feed item bodies.
|
3
|
+
|
4
|
+
#require 'feed_file_generator'
|
5
|
+
require 'fileutils'
|
6
|
+
require 'rexml/streamlistener'
|
7
|
+
require 'rexml/document'
|
8
|
+
require 'open3'
|
9
|
+
|
10
|
+
# NOTE requires the htmltidy program
|
11
|
+
# http://tidy.sourceforge.net/docs/Overview.html
|
12
|
+
|
13
|
+
class FeedYamlizer
  # Simplifies the HTML body of a feed item: the fragment is wrapped in an
  # XHTML page, normalised to XML by the external `tidy` program, and then
  # streamed through HtmlListener, which keeps only a small set of markup.
  class HtmlCleaner
    include FileUtils::Verbose

    # html - the HTML fragment (a String) to clean
    def initialize(html)
      @html = html
      decode_entities
      @xml = self.class.tidy(@html)
      simplified = parse
      # drop pseudo-tags such as <http://example.com/...>
      @result = simplified.gsub(/<http[^>]+>/, "")
    end

    # The simplified markup produced by the constructor.
    def output
      @result
    end

    # Streams the tidied XML through a new HtmlListener and returns its
    # result followed by a blank line.
    def parse
      @listener = HtmlListener.new
      REXML::Document.parse_stream(@xml, @listener)
      @listener.result + "\n\n"
    end

    # NOTE(review): the decoded string is discarded, so this call does not
    # change @html. Assigning it would turn escaped markup (&lt;b&gt;) into
    # live tags before tidy runs -- confirm the intent before changing this.
    def decode_entities
      HTMLEntities.new.decode @html
    end

    # Runs html through `tidy` (quiet, wrap at 120 columns, numeric entities,
    # UTF-8 in and out, XML output; tidy's stderr is discarded) and returns
    # whatever tidy prints.
    def self.tidy(html)
      # assumes input encoding of latin 1
      #output = Open3.popen3("tidy -q -n -wrap 120 -asxml -latin1") do |stdin, stdout, stderr|
      #output = IO.popen("tidy -q -n -wrap 120 -asxml -latin1", "r+") do |pipe|
      #output = IO.popen("tidy -q -wrap 120 -raw -asxml ", "r+") do |pipe| # if from latin1

      command = "tidy -q -wrap 120 -n -utf8 -asxml 2>/dev/null"
      page = <<-END
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title></title></head><body>#{html}</body></html>
      END
      IO.popen(command, "r+") { |io|
        io.puts page
        io.close_write
        #$stderr.puts stderr.read
        io.read
      }
    end
  end
end
|
62
|
+
|
63
|
+
# Counts the whitespace-separated words in string, ignoring markup tags.
#
# string - text that may contain HTML/XML tags
#
# Returns the number of words as an Integer (0 for empty or tag-only input).
def word_count(string)
  # %r{...} builds a Regexp; the previous %{...} was a plain String literal,
  # so tags were never removed. Tags become a space so that words separated
  # only by a tag ("a<br/>b") stay separate, and the argument-less split
  # does not count an empty leading field.
  string.gsub(%r{</?[^>]+>}, ' ').split.size
end
|
66
|
+
|
67
|
+
# all this is deprecated
|
68
|
+
# Legacy command-line mode, run only when this file is executed directly.
# Reads a feed file from STDIN: a YAML metadata header followed by item
# segments separated by lines of twenty dashes. Each item body is simplified
# and a :word_count: line is appended to the item's metadata.
if __FILE__ == $0
  # The input file is assumed to be in UTF-8
  feed_file = STDIN.read

  # The encoding name must be a String; the bare UTF-8 parsed as the
  # expression `UTF - 8` and raised NameError.
  feed_file.force_encoding 'UTF-8'
  segments = feed_file.split(/^-{20}$/)
  feed_meta = segments.shift
  # NOTE(review): this file does not require 'yaml' itself, and YAML::load
  # should only ever be given trusted input.
  orig_encoding = YAML::load(feed_meta)[:orig_encoding]

  new_segs = segments.map do |s|
    meta, body = s.split(/^\s*$/, 2)
    # NOTE(review): HtmlSimplifier is not defined anywhere in this file;
    # presumably it was the predecessor of FeedYamlizer::HtmlCleaner --
    # confirm before reviving this code path.
    new_body = HtmlSimplifier.new(body, orig_encoding).result.strip + "\n\n"
    meta = meta + ":word_count: #{ word_count(new_body) }\n"
    [meta, new_body].join("\n")
  end
  result = ([feed_meta] + new_segs).join( '-' * 20 )
  STDOUT.puts result
end
|
86
|
+
|
87
|
+
|
@@ -0,0 +1,125 @@
|
|
1
|
+
class FeedYamlizer
  # REXML stream listener that reduces tidied XHTML to a small set of markup.
  # Content is collected as a list of chunks (@content); block-level elements
  # start a new chunk. Links are replaced in the text by "label[n]" markers
  # and listed, numbered, after the body.
  class HtmlListener
    include REXML::StreamListener

    STRIP_TAGS = %w[ body font ]
    BLOCK_TAGS = %w[ p div ]
    HEADER_TAGS = %w[ h1 h2 h3 h4 h5 h6 ]

    # every header level is rewritten to this tag
    UNIFORM_HEADER_TAG = "h4"

    def initialize
      @nested_tags = []
      @content = [""]
      @links = []
    end

    # Returns the body (chunks joined by blank lines and re-wrapped by the
    # external `fmt` program), a blank line, and the numbered link list.
    def result
      # we call strip_empty_tags twice to catch empty tags nested in a tag like <p>
      # not foolproof but good enough for now
      body = @content.map {|line| strip_empty_tags( strip_empty_tags( line ).strip ) }.
        select {|line| line.strip != ""}.
        compact.
        join("\n\n")

      # width of the widest link number, used to right-align the list
      digits = @links.size.to_s.size

      body = format(body)

      body + "\n\n" + @links.map {|link|
        gutter = link[:index].to_s.rjust(digits)
        if link[:content] && link[:content].strip.length > 0
          %Q|#{gutter}. "#{link[:content].gsub(/[\r\n]+/, ' ').strip}"\n#{' ' * (digits + 2)}#{link[:href]}|
        else
          "#{gutter}. #{link[:href]}"
        end
      }.join("\n")
    end

    # Removes elements that contain nothing but whitespace.
    def strip_empty_tags(line)
      line.gsub(%r{<(\w+)[^>]*>\s*</\1>}, '')
    end

    # Stream callback: an element was opened.
    def tag_start(name, attrs)
      @nested_tags.push name
      case name
      when 'a'
        # Anchors without an href (e.g. <a name="top">) are not links. They
        # used to add an entry with a nil href and an empty "[n]" marker;
        # their text is now treated as ordinary content.
        if attrs['href']
          @links << {:href => attrs['href']}
          @in_link = true
        end
      when 'img'
        text = attrs['alt'] || attrs['title']
        chunk = ['img', text].join(':')
        @content[-1] << chunk
      when *HEADER_TAGS
        @content << "<#{UNIFORM_HEADER_TAG}>"
      when 'br' #skip
        #@content << "<br/>"
        @content << ""
      when 'blockquote'
        @content << "[blockquote]"
      when 'ul', 'ol', 'dl'
        @content << "<#{name}>"
      when 'li', 'dt', 'dd'
        @content[-1] << " <#{name}>"
      when 'strong', 'em'
        @content[-1] << "<#{name}>"
      when *BLOCK_TAGS
        @content << "<p>"
      when 'pre'
        @content << "<pre>"
      end
    end

    # Stream callback: an element was closed.
    def tag_end(name)
      @nested_tags.pop
      case name
      when 'a'
        # only anchors that opened a link (see tag_start) close one
        if @in_link
          @links[-1][:index] = @links.size
          @in_link = false
          @content[-1] << "#{(@links[-1][:content] || '').strip.gsub(/[\r\n]+/, ' ')}[#{@links.size}]"
        end
      when *HEADER_TAGS
        @content[-1] << "</#{UNIFORM_HEADER_TAG}>"
      when 'blockquote'
        @content << '[/blockquote]'
      when 'ul', 'ol', 'dl'
        @content[-1] << "</#{name}>"
      when 'li', 'dt', 'dd'
        @content[-1] << " </#{name}>"
      when 'strong', 'em'
        @content[-1] << "</#{name}>"
      when *BLOCK_TAGS
        @content[-1] << "</p>"
      when 'pre'
        @content[-1] << "</pre>"
      end
    end

    # Stream callback: character data. Inside a link the text becomes the
    # link's label; otherwise it is appended to the current chunk.
    def text(text)
      # NOTE(review): \a matches the BEL character, so this guard practically
      # never fires; \A (start of string) was presumably intended. It is left
      # unchanged on purpose: skipping whitespace-only text would also drop
      # the spaces between adjacent inline elements and run words together.
      return if text =~ /\a\s*\Z/
      if @in_link
        (@links[-1][:content] ||= "") << text
        return
      end

      # probably slow, but ok for now
      @content[-1] << text
    end

    # True when the innermost open element is a block tag.
    def start_of_block?
      BLOCK_TAGS.include? @nested_tags[-1]
    end

    # Slash-joined names of the currently open elements.
    def path
      @nested_tags.join('/')
    end

    # Re-wraps x by piping it through the external `fmt` program and returns
    # fmt's output.
    def format(x)
      IO.popen("fmt", "r+") do |pipe|
        # Feed fmt from a separate thread while this one reads: writing the
        # whole input first can deadlock once both pipe buffers are full.
        writer = Thread.new do
          pipe.puts x
          pipe.close_write
        end
        formatted = pipe.read
        writer.join
        formatted
      end
    end

  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# just takes simplified HTML and converts it to plain text
|
2
|
+
class FeedYamlizer
  # Converts simplified HTML to plain text: the markup is parsed with
  # Nokogiri when the object is built, and #output returns the document's
  # inner text.
  class Textifier
    # html - the (simplified) HTML string to convert
    def initialize(html)
      parsed = Nokogiri::HTML.parse(html)
      @doc = parsed
    end

    # TODO beef this up with real effects

    # The text content of the parsed document, with all tags removed.
    def output; @doc.inner_text; end
  end
end
|
15
|
+
|
16
|
+
|
metadata
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: feed_yamlizer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: 0.0.1
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Daniel Choi
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2011-01-13 00:00:00 -05:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: nokogiri
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
version: "0"
|
31
|
+
type: :runtime
|
32
|
+
version_requirements: *id001
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: htmlentities
|
35
|
+
prerelease: false
|
36
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
segments:
|
42
|
+
- 0
|
43
|
+
version: "0"
|
44
|
+
type: :runtime
|
45
|
+
version_requirements: *id002
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: sqlite3-ruby
|
48
|
+
prerelease: false
|
49
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
segments:
|
55
|
+
- 0
|
56
|
+
version: "0"
|
57
|
+
type: :runtime
|
58
|
+
version_requirements: *id003
|
59
|
+
description: Converts feeds to YAML and converts entries to plain text
|
60
|
+
email:
|
61
|
+
- dhchoi@gmail.com
|
62
|
+
executables:
|
63
|
+
- feed2yaml
|
64
|
+
extensions: []
|
65
|
+
|
66
|
+
extra_rdoc_files: []
|
67
|
+
|
68
|
+
files:
|
69
|
+
- .gitignore
|
70
|
+
- MIT-LICENSE.txt
|
71
|
+
- README.markdown
|
72
|
+
- bin/feed2yaml
|
73
|
+
- feed_yamlizer.gemspec
|
74
|
+
- lib/feed_yamlizer.rb
|
75
|
+
- lib/feed_yamlizer/feed_listener.rb
|
76
|
+
- lib/feed_yamlizer/feed_parser.rb
|
77
|
+
- lib/feed_yamlizer/html_cleaner.rb
|
78
|
+
- lib/feed_yamlizer/html_listener.rb
|
79
|
+
- lib/feed_yamlizer/textifier.rb
|
80
|
+
- lib/feed_yamlizer/version.rb
|
81
|
+
has_rdoc: true
|
82
|
+
homepage: https://github.com/danchoi/feed_yamlizer
|
83
|
+
licenses: []
|
84
|
+
|
85
|
+
post_install_message:
|
86
|
+
rdoc_options: []
|
87
|
+
|
88
|
+
require_paths:
|
89
|
+
- lib
|
90
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
91
|
+
none: false
|
92
|
+
requirements:
|
93
|
+
- - ">="
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
segments:
|
96
|
+
- 0
|
97
|
+
version: "0"
|
98
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
99
|
+
none: false
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
segments:
|
104
|
+
- 0
|
105
|
+
version: "0"
|
106
|
+
requirements: []
|
107
|
+
|
108
|
+
rubyforge_project: feed_yamlizer
|
109
|
+
rubygems_version: 1.3.7
|
110
|
+
signing_key:
|
111
|
+
specification_version: 3
|
112
|
+
summary: A feed parser and converter
|
113
|
+
test_files: []
|
114
|
+
|