RubyGems - feed_yamlizer - Versions diffs - 0.0.1 - Mend

feed_yamlizer 0.0.1

Files changed (13) hide show

data/.gitignore +8 -0
data/MIT-LICENSE.txt +21 -0
data/README.markdown +7 -0
data/bin/feed2yaml +31 -0
data/feed_yamlizer.gemspec +24 -0
data/lib/feed_yamlizer.rb +108 -0
data/lib/feed_yamlizer/feed_listener.rb +102 -0
data/lib/feed_yamlizer/feed_parser.rb +28 -0
data/lib/feed_yamlizer/html_cleaner.rb +87 -0
data/lib/feed_yamlizer/html_listener.rb +125 -0
data/lib/feed_yamlizer/textifier.rb +16 -0
data/lib/feed_yamlizer/version.rb +3 -0
metadata +114 -0

data/.gitignore ADDED Viewed

@@ -0,0 +1,8 @@
+.DS_Store
+*swp
+login.yml
+gmail.yml
+*.log
+pkg/
+.rvmrc
+notes.txt

data/MIT-LICENSE.txt ADDED Viewed

@@ -0,0 +1,21 @@
+Copyright (c) 2011 Daniel Choi, http://danielchoi.com/software/
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.markdown ADDED Viewed

@@ -0,0 +1,7 @@
+# feed_yamlizer
+feed_yamlizer converts feeds into Ruby hashes and also processes feed entries
+into plain text.
+More usage instructions to come.

data/bin/feed2yaml ADDED Viewed

@@ -0,0 +1,31 @@
+#!/usr/bin/env ruby
+# Command-line front end for FeedYamlizer.
+#
+# Usage:
+#   feed2yaml FEED_URL      # fetch and convert the feed at FEED_URL
+#   feed2yaml < feed.xml    # convert feed XML read from standard input
+#
+# The converted feed is printed as YAML. When the TEST environment variable is
+# set, the title and plain text of every item are printed instead.
+begin
+  require 'feed_yamlizer'
+rescue LoadError
+  # Retry once RubyGems has been loaded.
+  require 'rubygems'
+  require 'feed_yamlizer'
+end
+require 'open-uri'
+# for testing
+# Prints each item of res as a dashed separator, its title, a blank line and
+# its plain-text content.
+def print_text(res)
+  res[:items].each do |item|
+    puts '-' * 30
+    puts item[:title]
+    puts
+    puts item[:content][:text]
+  end
+end
+# An interactive STDIN means nothing was piped in, so the feed has to come
+# from a URL argument; without one there is nothing to process.
+result = if STDIN.tty?
+           url = ARGV.first
+           abort "Usage: feed2yaml FEED_URL  (or pipe feed XML on standard input)" unless url
+           FeedYamlizer.process_url url
+         else
+           FeedYamlizer.process_xml STDIN.read
+         end
+if ENV['TEST']
+  print_text result
+else
+  puts result.to_yaml
+end

data/feed_yamlizer.gemspec ADDED Viewed

@@ -0,0 +1,24 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "feed_yamlizer/version"
+# Gem specification for feed_yamlizer. The version comes from
+# FeedYamlizer::VERSION, loaded by the require above.
+Gem::Specification.new do |s|
+  s.name        = "feed_yamlizer"
+  s.version     = FeedYamlizer::VERSION
+  s.platform    = Gem::Platform::RUBY
+  s.authors     = ["Daniel Choi"]
+  s.email       = ["dhchoi@gmail.com"]
+  s.homepage    = "https://github.com/danchoi/feed_yamlizer"
+  s.summary     = %q{A feed parser and converter}
+  s.description = %q{Converts feeds to YAML and converts entries to plain text}
+  s.rubyforge_project = "feed_yamlizer"
+  # The file lists are read from git, so the gem has to be built from inside
+  # a git checkout with the git executable available.
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  # Executables are listed by basename only (bin/feed2yaml => feed2yaml).
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+  # Runtime dependencies; none of them carries a version constraint.
+  s.add_dependency 'nokogiri'
+  s.add_dependency 'htmlentities'
+  s.add_dependency 'sqlite3-ruby' # because htmlentities somehow requires this; a bug
+end

data/lib/feed_yamlizer.rb ADDED Viewed

@@ -0,0 +1,108 @@
+# Takes raw feed XML as input and generates a file with YAML and raw feed item
+# bodies in a uniform "UTF-8".
+# requires Ruby 1.9
+require 'fileutils'
+require 'iconv'
+require 'open-uri'
+require 'rexml/document'
+require 'rexml/streamlistener'
+require 'yaml'
+require 'htmlentities'
+require 'nokogiri'
+require 'feed_yamlizer/feed_listener'
+require 'feed_yamlizer/feed_parser'
+require 'feed_yamlizer/html_listener'
+require 'feed_yamlizer/html_cleaner'
+require 'feed_yamlizer/textifier'
+# Turns the hash produced by FeedParser into the final result hash:
+#   {:meta => {feed fields}, :items => [{item fields, :content => {:html, :text}}]}
+# The class-level methods normalise the raw XML to UTF-8 and drive the
+# parse / clean / textify pipeline.
+class FeedYamlizer
+  include FileUtils::Verbose
+  # Encoding assumed for raw XML that carries no encoding declaration; UTF-8
+  # is the XML default.
+  UNDECLARED_XML_ENCODING = 'UTF-8'
+  # feed - hash of feed-level fields plus an :items array of item hashes.
+  def initialize(feed)
+    @feed = feed
+    @result = {:meta => {}, :items => []}
+  end
+  # Builds and returns the result hash. The item list is rebuilt on every
+  # call, so calling this more than once does not duplicate items.
+  def result
+    add_feed_metaresult
+    add_items
+    @result
+  end
+  # Copies the feed-level fields into @result[:meta].
+  def add_feed_metaresult
+    fields = [:title, :link, :xml_encoding]
+    @result[:meta] = fields.each_with_object({}) {|field, meta| meta[field] = @feed[field] }
+  end
+  # Rebuilds @result[:items] with one entry (metadata + content) per feed item.
+  def add_items
+    @result[:items] = []
+    @feed[:items].each_with_index {|item, i|
+      add_item_metaresult item, i
+      add_raw_content item, i
+    }
+  end
+  # Appends a new entry holding the item's metadata fields. index is accepted
+  # but not used.
+  def add_item_metaresult(item, index)
+    fields = [:title, :author, :guid, :pub_date, :link]
+    @result[:items] << fields.each_with_object({}) {|field, meta| meta[field] = item[field] }
+  end
+  # Stores the item's body on the most recently added entry: the raw markup
+  # under [:content][:html] and its plain-text rendering under
+  # [:content][:text]. Falls back from :content to :summary to an empty string.
+  def add_raw_content(item, index)
+    # Drop leading whitespace on every line, then trim the whole string.
+    content = (item[:content] || item[:summary] || "").gsub(/^\s*/, '').strip
+    entry = @result[:items][-1]
+    entry[:content] = {:html => content}
+    # TODO check if HTML or plain text!
+    simplified = HtmlCleaner.new(content).output
+    #entry[:content][:simplified] = simplified
+    entry[:content][:text] = Textifier.new(simplified).output
+  end
+  class << self
+    # Returns the value of the first encoding="..." attribute found in rawxml,
+    # or nil when there is none. The value is also logged to STDERR.
+    def xml_encoding(rawxml)
+      encoding = rawxml[/encoding=["']([^"']+)["']/, 1]
+      STDERR.puts "xml encoding: #{encoding.inspect}"
+      encoding
+    end
+    # Returns x converted from encoding to UTF-8 (Iconv is asked to
+    # transliterate or ignore characters it cannot convert).
+    def to_utf(x, encoding = 'ISO-8859-1')
+      Iconv.conv("UTF-8//TRANSLIT//IGNORE", encoding, x)
+    end
+    # Aborts the process unless a tidy executable is found on the PATH.
+    def check_for_tidy
+      abort "Please install tidy" if `which tidy` == ''
+    end
+    # main method
+    # Converts feed_xml from encoding to UTF-8, parses it and returns the
+    # result hash.
+    def run(feed_xml, encoding)
+      check_for_tidy
+      feed_xml = to_utf feed_xml, encoding
+      parsed_data = FeedYamlizer::FeedParser.new(feed_xml).result
+      FeedYamlizer.new(parsed_data).result
+    end
+    # Processes raw feed XML held in a String.
+    def process_xml(xml)
+      # A document without an encoding declaration yields nil here, which
+      # Iconv cannot convert from, so fall back to the XML default.
+      run xml, xml_encoding(xml) || UNDECLARED_XML_ENCODING
+    end
+    # Fetches url (via open-uri's Kernel#open) and processes the response.
+    # The encoding is taken from the response charset, then from the XML
+    # declaration, then defaults to ISO-8859-1.
+    def process_url(url)
+      # The block form closes the response once it has been read.
+      charset, xml = open(url) {|response| [response.charset, response.read] }
+      #STDERR.puts "charset: #{charset}"
+      encoding = charset || xml_encoding(xml) || "ISO-8859-1"
+      run xml, encoding
+    end
+  end
+end

data/lib/feed_yamlizer/feed_listener.rb ADDED Viewed

@@ -0,0 +1,102 @@
+class FeedYamlizer
+  # REXML stream listener that collects feed-level and item-level fields.
+  # Elements are recognised by their slash-joined path from the document root;
+  # the path lists below cover feed/..., rss/channel/... and rdf:RDF/... layouts.
+  class FeedListener
+    include REXML::StreamListener
+    FEED_TITLE_TAGS = %w[ feed/title rss/channel/title rdf:RDF/channel/title ]
+    FEED_LINK_TAGS = %w[ rss/channel/link rdf:RDF/channel/link ]
+    ITEM_START_TAGS = %w[ feed/entry rss/channel/item rdf:RDF/item ]
+    ITEM_TITLE_TAGS = %w[ feed/entry/title rss/channel/item/title rdf:RDF/item/title ]
+    ITEM_AUTHOR_TAGS = %w[ feed/entry/author/name rss/channel/item/author rdf:RDF/item/dc:creator ]
+    ITEM_GUID_TAGS = %w[ feed/entry/id rss/channel/item/guid rdf:RDF/item/guid ]
+    ITEM_PUB_DATE_TAGS = %w[ feed/entry/published feed/entry/created feed/entry/modified rss/channel/item/pubDate rdf:RDF/item/dc:date ]
+    ITEM_LINK_TAGS = %w[ rss/channel/item/link rdf:RDF/item/link ]
+    ITEM_SUMMARY_TAGS = %w[ feed/entry/summary rss/channel/item/description rdf:RDF/item/description ]
+    # Regexps rather than strings: they are unanchored, so text nested deeper
+    # inside a content element matches as well.
+    ITEM_CONTENT_TAGS = [ %r{feed/entry/content}, %r{rss/channel/item/content}, %r{rss/channel/item/content:encoded},  %r{rss/item/content}, %r{rdf:RDF/item/content} ]
+    def initialize
+      @x = {:items => []}
+      @nested_tags = []
+    end
+    # The hash collected so far: feed fields plus :items, an array of item hashes.
+    def result
+      @x
+    end
+    # Pushes the tag onto the path stack, then either records a link taken
+    # from the href attribute or opens a new item.
+    def tag_start(name, attrs)
+      @nested_tags.push name
+      current = path
+      if current == 'feed/link'
+        @x[:link] = encode(attrs['href'])
+      elsif ITEM_START_TAGS.include?(current)
+        @current_item = {}
+      elsif current == 'feed/entry/link'
+        @current_item[:link] = encode(attrs['href'])
+      end
+    end
+    # Stores the finished item when an item element closes, then pops the tag.
+    def tag_end(name)
+      if ITEM_START_TAGS.include?(path)
+        @x[:items] << @current_item
+        @current_item = nil
+      end
+      @nested_tags.pop
+    end
+    # Routes a text node to the field that matches the current path. Summary
+    # and content may arrive in several pieces and are concatenated.
+    def text(text)
+      case path
+      when *FEED_TITLE_TAGS    then @x[:title] = encode(text.strip)
+      when *FEED_LINK_TAGS     then @x[:link] = encode(text.strip)
+      when *ITEM_TITLE_TAGS    then @current_item[:title] = encode(text.strip)
+      when *ITEM_AUTHOR_TAGS   then @current_item[:author] = encode(text.strip)
+      when *ITEM_GUID_TAGS     then @current_item[:guid] = encode(text)
+      when *ITEM_PUB_DATE_TAGS then @current_item[:pub_date] = DateTime.parse(encode(text))
+      when *ITEM_LINK_TAGS     then @current_item[:link] = encode(text)
+      when *ITEM_SUMMARY_TAGS  then append_to_item(:summary, text)
+      when *ITEM_CONTENT_TAGS  then append_to_item(:content, text)
+      end
+    end
+    # CDATA sections are handled exactly like text nodes.
+    alias_method :cdata, :text
+    # Records the declared encoding in lower case, or "UTF-8" when the XML
+    # declaration names none.
+    def xmldecl(decl, encoding, extra)
+      @x[:xml_encoding] = encoding ? encoding.downcase : "UTF-8"
+    end
+    # Slash-joined path of the currently open elements.
+    def path
+      @nested_tags.join('/')
+    end
+    # encoding method
+    # TODO
+    # Currently returns the string unchanged.
+    def encode(string)
+      string
+    end
+    private
+    # Sets the item field on first use and appends to it afterwards.
+    def append_to_item(field, text)
+      chunk = encode(text)
+      if @current_item[field]
+        @current_item[field] << chunk
+      else
+        @current_item[field] = chunk
+      end
+    end
+  end
+end

data/lib/feed_yamlizer/feed_parser.rb ADDED Viewed

@@ -0,0 +1,28 @@
+# Custom feed parsing code by Daniel Choi dhchoi@gmail.com
+# The goal is minimal dependencies (e.g. Feedzirra has too special dependencies).
+# TODO
+# come up with an encoding handling strategy
+require 'iconv'
+require 'yaml'
+class FeedYamlizer
+  # Runs raw feed XML through a FeedListener using REXML's stream parser.
+  class FeedParser
+    # xml      - the feed document as a String.
+    # encoding - accepted but not used.
+    #
+    # If REXML rejects the document, it is transliterated to US-ASCII (the
+    # input being treated as ISO-8859-1) and parsed a second time. A
+    # REXML::ParseException raised by that second pass propagates to the caller.
+    def initialize(xml, encoding=nil)
+      @xml = xml
+      @listener = parse(@xml)
+    # TODO this is a hack, do it right
+    rescue REXML::ParseException
+      #puts "REXML::ParseException; converting xml to ascii"
+      @xml = Iconv.conv("US-ASCII//TRANSLIT//IGNORE", "ISO-8859-1", @xml)
+      # Start again with a fresh listener: the one from the failed pass still
+      # holds its open-tag stack and any items it had collected, which would
+      # corrupt element paths and duplicate items on the second pass.
+      @listener = parse(@xml)
+    end
+    # The hash collected by the listener.
+    def result
+      @listener.result
+    end
+    private
+    # Stream-parses xml with a new FeedListener and returns that listener.
+    def parse(xml)
+      listener = FeedListener.new
+      REXML::Document.parse_stream(xml, listener)
+      listener
+    end
+  end
+end

data/lib/feed_yamlizer/html_cleaner.rb ADDED Viewed

@@ -0,0 +1,87 @@
+# Takes output of feed_file_generator.rb encoded in UTF-8 as input and
+# strips superfluous markup from the feed item bodies.
+#require 'feed_file_generator'
+require 'fileutils'
+require 'rexml/streamlistener'
+require 'rexml/document'
+require 'open3'
+# NOTE requires the htmltidy program
+# http://tidy.sourceforge.net/docs/Overview.html
+class FeedYamlizer
+  # Normalises one item's HTML: the markup is run through the external tidy
+  # program to obtain XML, which is then reduced by HtmlListener.
+  class HtmlCleaner
+    include FileUtils::Verbose
+    # html - the item body markup as a String.
+    # All the work happens here; #output only returns the stored result.
+    def initialize(html)
+      @html = html
+      decode_entities
+      @xml = self.class.tidy(@html)
+      # Remove any <http...> pseudo-tags left in the listener's output.
+      @result = parse.gsub(/<http[^>]+>/, "")
+    end
+    # The cleaned markup computed by the constructor.
+    def output
+      @result
+    end
+    # Stream-parses the tidied XML with a new HtmlListener and returns the
+    # listener's result followed by two newlines.
+    def parse
+      @listener = HtmlListener.new
+      REXML::Document.parse_stream(@xml, @listener)
+      @listener.result + "\n\n"
+    end
+    # NOTE(review): the value returned by coder.decode is not assigned, so
+    # unless HTMLEntities#decode mutates its argument this leaves @html
+    # unchanged. Confirm whether `@html = coder.decode(@html)` was intended.
+    def decode_entities
+      coder = HTMLEntities.new
+      coder.decode @html
+    end
+    # Wraps html in a minimal XHTML document, pipes it through the tidy
+    # command line below and returns whatever tidy writes to standard output.
+    # tidy's standard error is discarded by the shell redirect.
+    def self.tidy(html)
+      # The active command passes -utf8; the commented-out variants below are
+      # earlier attempts that used latin1 / raw input.
+      #output = Open3.popen3("tidy -q -n -wrap 120 -asxml -latin1") do |stdin, stdout, stderr|
+      #output = IO.popen("tidy -q -n -wrap 120 -asxml -latin1", "r+") do |pipe|
+      #output = IO.popen("tidy -q -wrap 120 -raw -asxml ", "r+") do |pipe| # if from latin1
+      tidy = "tidy -q -wrap 120 -n -utf8 -asxml 2>/dev/null"
+      output = IO.popen(tidy, "r+") do |pipe|
+        input = <<-END
+  <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+  <html xmlns="http://www.w3.org/1999/xhtml">
+  <head><title></title></head><body>#{html}</body></html>
+        END
+        pipe.puts input
+        # Close the write side so tidy sees end-of-input before we read.
+        pipe.close_write
+        #$stderr.puts stderr.read
+        pipe.read
+      end
+      output
+    end
+  end
+end
+# Counts the whitespace-separated words in string after removing anything
+# that looks like an HTML/XML tag. Returns 0 for an empty or blank string.
+def word_count(string)
+  # %r{...} builds a Regexp. The former %{...} literal was a plain String,
+  # which gsub only matches literally, so no tag was ever removed.
+  # split without an argument ignores leading whitespace, so text that starts
+  # with a stripped tag followed by a space is not over-counted.
+  string.gsub(%r{</?[^>]+>}, '').split.size
+end
+# all this is deprecated
+# Stand-alone mode: reads a feed file from STDIN whose segments are separated
+# by lines of twenty dashes (YAML feed metadata first, then one segment per
+# item), rewrites each item body, adds a :word_count: line to each item's
+# metadata and prints the re-joined file to STDOUT.
+if __FILE__ == $0
+  # YAML is used below and is not loaded by this file's own requires.
+  require 'yaml'
+  # The input file is assumed to be in UTF-8
+  feed_file = STDIN.read
+  # The encoding name has to be a String: a bare UTF-8 is parsed as the
+  # expression `UTF - 8` and fails on the unknown constant UTF.
+  feed_file.force_encoding 'UTF-8'
+  segments = feed_file.split(/^-{20}$/)
+  feed_meta = segments.shift
+  orig_encoding = YAML::load(feed_meta)[:orig_encoding]
+  new_segs = segments.map do |s|
+    # Each segment is metadata, a blank line, then the body.
+    meta, body = s.split(/^\s*$/, 2)
+    # FIXME: HtmlSimplifier is not defined in this file; this line fails
+    # unless that class is provided elsewhere. Confirm the intended replacement.
+    new_body = HtmlSimplifier.new(body, orig_encoding).result.strip + "\n\n"
+    meta = meta + ":word_count: #{ word_count(new_body) }\n"
+    [meta, new_body].join("\n")
+  end
+  result = ([feed_meta] + new_segs).join( '-' * 20  )
+  STDOUT.puts result
+end

data/lib/feed_yamlizer/html_listener.rb ADDED Viewed

@@ -0,0 +1,125 @@
+class FeedYamlizer
+  # REXML stream listener that reduces parsed (X)HTML to a small set of tags
+  # and collects hyperlinks into a numbered reference list.
+  #
+  # @content is an array of text chunks: some tags start a new chunk, others
+  # append to the last one. #result joins the non-empty chunks with blank
+  # lines, pipes them through the external `fmt` command and appends the
+  # link list.
+  class HtmlListener
+    include REXML::StreamListener
+    # Not referenced elsewhere in this class.
+    STRIP_TAGS = %w[ body font ]
+    BLOCK_TAGS = %w[ p div ]
+    HEADER_TAGS =  %w[ h1 h2 h3 h4 h5 h6 ]
+    # Every header level is emitted as this one tag.
+    UNIFORM_HEADER_TAG = "h4"
+    def initialize
+      @nested_tags = []
+      @content = [""]
+      @links = []
+    end
+    # Returns the formatted text followed by a blank line and one entry per
+    # collected link: its index, its quoted text (when it has any) and its href.
+    def result
+      # we call strip_empty_tags twice to catch empty tags nested in a tag like <p>
+      # not foolproof but good enough for now
+      x = @content.map {|line| strip_empty_tags( strip_empty_tags( line ).strip ) }.
+        select {|line| line.strip != ""}.
+        compact.
+        join("\n\n")
+      # Width of the widest link index, used to right-align the index column.
+      digits = @links.size.to_s.size
+      x = format(x)
+      x + "\n\n" + @links.map {|x|
+        gutter = x[:index].to_s.rjust(digits)
+        if x[:content] && x[:content].strip.length > 0
+          %Q|#{gutter}. "#{x[:content].gsub(/[\r\n]+/, ' ').strip}"\n#{' ' * (digits + 2)}#{x[:href]}|
+        else
+          "#{gutter}. #{x[:href]}"
+        end
+      }.join("\n")
+    end
+    # Removes elements that contain nothing but whitespace, e.g. <p> </p>.
+    def strip_empty_tags(line)
+      line.gsub(%r{<(\w+)[^>]*>\s*</\1>}, '')
+    end
+    # Opening tags: list containers, headers, blockquote, br, block tags and
+    # pre start a new chunk; img, list items, strong and em append to the
+    # current chunk; an <a> starts collecting a link. Other tags emit nothing.
+    def tag_start(name, attrs)
+      @nested_tags.push name
+      case name
+      when 'a'
+        @links << {:href => attrs['href']}
+        @in_link = true
+      when 'img'
+        # Images become "img:<alt or title>".
+        text = attrs['alt'] || attrs['title']
+        chunk = ['img', text].join(':')
+        @content[-1] << chunk
+      when *HEADER_TAGS
+        @content << "<#{UNIFORM_HEADER_TAG}>"
+      when 'br' #skip
+        #@content << "<br/>"
+        @content << ""
+      when 'blockquote'
+        @content << "[blockquote]"
+      when 'ul', 'ol', 'dl'
+        @content << "<#{name}>"
+      when 'li', 'dt', 'dd'
+        @content[-1] << "  <#{name}>"
+      when 'strong', 'em'
+        @content[-1] << "<#{name}>"
+      when *BLOCK_TAGS
+        @content << "<p>"
+      when 'pre'
+        @content << "<pre>"
+      end
+    end
+    # Closing tags: emits the matching close marker. For </a> the collected
+    # link text is written followed by the link's number in square brackets.
+    def tag_end(name)
+      @nested_tags.pop
+      case name
+      when 'a'
+        # Links are numbered from 1 in the order they are closed.
+        @links[-1][:index] = @links.size
+        @in_link = false
+        @content[-1] << "#{(@links[-1][:content] || '').strip.gsub(/[\r\n]+/, ' ')}[#{@links.size}]"
+      when *HEADER_TAGS
+        @content[-1] << "</#{UNIFORM_HEADER_TAG}>"
+      when 'blockquote'
+        @content << '[/blockquote]'
+      when 'ul', 'ol', 'dl'
+        @content[-1] << "</#{name}>"
+      when 'li', 'dt', 'dd'
+        @content[-1] << "  </#{name}>"
+      when 'strong', 'em'
+        @content[-1] << "</#{name}>"
+      when *BLOCK_TAGS
+        @content[-1] << "</p>"
+      when 'pre'
+        @content[-1] << "</pre>"
+      end
+    end
+    # Text inside an <a> is accumulated on the link; any other text is
+    # appended to the current chunk.
+    def text(text)
+      # NOTE(review): \a is the BEL escape in a Ruby regexp, so this guard only
+      # fires for text ending in BEL plus optional whitespace. \A (start of
+      # string) may have been intended, to skip whitespace-only nodes. Confirm
+      # before changing: that would also drop spaces between adjacent inline
+      # elements.
+      return if text =~ /\a\s*\Z/
+      if @in_link
+        (@links[-1][:content] ||= "") << text
+        return
+      end
+      # probably slow, but ok for now
+      @content[-1] << text
+    end
+    # True when the innermost open tag is one of BLOCK_TAGS.
+    def start_of_block?
+      BLOCK_TAGS.include? @nested_tags[-1]
+    end
+    # Slash-joined path of the currently open tags.
+    def path
+      @nested_tags.join('/')
+    end
+    # Pipes x through the external `fmt` command and returns its output.
+    def format(x)
+      IO.popen("fmt", "r+") do |pipe|
+        pipe.puts x
+        pipe.close_write
+        pipe.read
+      end
+    end
+  end
+end

data/lib/feed_yamlizer/textifier.rb ADDED Viewed

@@ -0,0 +1,16 @@
+# just takes simplified HTML and converts it to plain text
+class FeedYamlizer
+  # Parses the simplified markup with Nokogiri and exposes the document's
+  # text content.
+  class Textifier
+    # html - markup String handed to Nokogiri's HTML parser.
+    def initialize(html)
+      @doc = Nokogiri::HTML(html)
+    end
+    # TODO beef this up with real effects
+    # Returns the text of the parsed document.
+    def output
+      @doc.text
+    end
+  end
+end

data/lib/feed_yamlizer/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+class FeedYamlizer
+  # Gem version string; the gemspec reads it as FeedYamlizer::VERSION.
+  VERSION = "0.0.1"
+end

metadata ADDED Viewed

@@ -0,0 +1,114 @@
+--- !ruby/object:Gem::Specification
+name: feed_yamlizer
+version: !ruby/object:Gem::Version
+  prerelease: false
+  segments:
+  - 0
+  - 0
+  - 1
+  version: 0.0.1
+platform: ruby
+authors:
+- Daniel Choi
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2011-01-13 00:00:00 -05:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        version: "0"
+  type: :runtime
+  version_requirements: *id001
+- !ruby/object:Gem::Dependency
+  name: htmlentities
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        version: "0"
+  type: :runtime
+  version_requirements: *id002
+- !ruby/object:Gem::Dependency
+  name: sqlite3-ruby
+  prerelease: false
+  requirement: &id003 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        version: "0"
+  type: :runtime
+  version_requirements: *id003
+description: Converts feeds to YAML and converts entries to plain text
+email:
+- dhchoi@gmail.com
+executables:
+- feed2yaml
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- MIT-LICENSE.txt
+- README.markdown
+- bin/feed2yaml
+- feed_yamlizer.gemspec
+- lib/feed_yamlizer.rb
+- lib/feed_yamlizer/feed_listener.rb
+- lib/feed_yamlizer/feed_parser.rb
+- lib/feed_yamlizer/html_cleaner.rb
+- lib/feed_yamlizer/html_listener.rb
+- lib/feed_yamlizer/textifier.rb
+- lib/feed_yamlizer/version.rb
+has_rdoc: true
+homepage: https://github.com/danchoi/feed_yamlizer
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 0
+      version: "0"
+requirements: []
+rubyforge_project: feed_yamlizer
+rubygems_version: 1.3.7
+signing_key:
+specification_version: 3
+summary: A feed parser and converter
+test_files: []