RubyGems - Html2Feedbooks - Versions diffs - 1.1.1 → 1.3.1 - Mend

Html2Feedbooks 1.1.1 → 1.3.1

Files changed (26) hide show

data/.gitignore +4 -0
data/Gemfile +4 -0
data/Rakefile +1 -0
data/bin/html2fb.rb +4 -7
data/html2fb.gemspec +27 -0
data/lib/{app.rb → html2fb/app.rb} +8 -9
data/lib/{conf.rb → html2fb/conf.rb} +0 -0
data/lib/{document.rb → html2fb/document.rb} +0 -0
data/lib/{downloader.rb → html2fb/downloader.rb} +0 -0
data/lib/{feedbooks.rb → html2fb/feedbooks.rb} +5 -6
data/lib/{parser.rb → html2fb/parser.rb} +89 -49
data/lib/html2fb/version.rb +3 -0
data/lib/html2fb.rb +7 -0
data/samples/107-h.htm +19642 -0
data/samples/107-h2.htm +1259 -0
data/samples/3049-h.htm +7787 -0
data/samples/3058-h.htm +8732 -0
data/samples/3258-h.htm +19894 -0
data/samples/3258-h2.htm +686 -0
data/samples/3469-h.htm +14024 -0
data/samples/conf107-h.yml +27 -0
data/samples/conf3049-h.yml +27 -0
data/samples/conf3058-h.yml +27 -0
data/samples/conf3258-h.yml +26 -0
data/samples/conf3469-h.yml +32 -0
metadata +41 -21

data/.gitignore ADDED Viewed

@@ -0,0 +1,4 @@
+*.gem
+.bundle
+Gemfile.lock
+pkg/*

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+source "http://rubygems.org"
+# Specify your gem's dependencies in html2fb.gemspec
+gemspec

data/Rakefile ADDED Viewed

@@ -0,0 +1 @@
+require "bundler/gem_tasks"

data/bin/html2fb.rb CHANGED Viewed

@@ -1,14 +1,11 @@
-#!/usr/bin/ruby
+#!/usr/bin/env ruby
+require 'rubygems'
 require 'optparse'
 require 'open-uri'
-require 'conf.rb'
-require 'downloader.rb'
-require 'document.rb'
-require 'parser.rb'
-require 'feedbooks.rb'
 require 'tmpdir'
 require 'launchy'
 require 'digest/md5'
+require 'lib/html2fb'
 include HTML2FB
@@ -86,7 +83,7 @@ if options[:preview]
 	puts "A preview of the parsed file should be opening in your webbrowser now"
 	puts "If nothing open you can open the file located at : #{page}"
 	puts "When happy with the parsed output rerun with -s option to send to Feedbooks.com"
-	Launchy::Browser.run(page)
+	Launchy.open(page)
 else
 	doc.to_feedbooks(conf)
 end

data/html2fb.gemspec ADDED Viewed

@@ -0,0 +1,27 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "html2fb/version"
+Gem::Specification.new do |s|
+  s.name        = "Html2Feedbooks"
+  s.version     = Html2fb::VERSION
+  s.authors = ["Benoit Larroque"]
+  s.email = ["benoit dot larroque at feedbooks dot com"]
+  s.summary = %q{Html2Feedbooks is script to automate basic publishing on feedbooks.com}
+  s.homepage = %q{http://github.com/zetaben/Html2Feedbooks}
+  s.description = %q{Html2Feedbooks is script to automate basic publishing on feedbooks.com}
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+  s.default_executable = 'html2fb.rb'
+  s.add_dependency('nokogiri','>=1.4.1')
+  s.add_dependency('htmlentities', '>= 4.2.1')
+  s.add_dependency('launchy', '>= 2.0.0')
+  s.add_dependency('progressbar', '>= 0.0.3')
+  # specify any dependencies here; for example:
+  # s.add_development_dependency "rspec"
+  # s.add_runtime_dependency "rest-client"
+end

data/lib/{app.rb → html2fb/app.rb} RENAMED Viewed

@@ -34,8 +34,8 @@ class AtomPost
 			req = Net::HTTP::Get.new(url.path)
 			req.basic_auth user,pass  unless user.nil?
 			response = http.request(req)
-			doc=Hpricot(response.body)
-			e=doc.at('//entry').at('link[@rel="down"]')
+			doc=Nokogiri::XML(response.body).remove_namespaces!
+			e=doc.at('//entry/link[@rel="down"]')
 			return 	URI.parse(e[:href]).path unless e.nil?
 		}
 	end
@@ -47,7 +47,6 @@ class AtomPost
 		#STDERR.puts "sending to #{url}"
 		req = Net::HTTP::Post.new(url.path)
 		req.basic_auth user,pass  unless user.nil?
 		req.body  = '<?xml version="1.0"?>'+"\n"
 		req.body  +='<entry xmlns="http://www.w3.org/2005/Atom">'+"\n"
 		req.body  +='<title>'+decode_text(title)+'</title>'+"\n"
@@ -77,9 +76,9 @@ class AtomPost
 	def recode_text(txt)
 		return txt if txt.blank?
-		m=Hpricot(txt)
-		m.traverse_text{|t| t.content=force_decimal_entities(t.content) if t.content.match(/&[a-z][a-z0-9]+;/i)}
-		m.to_html
+		m=Nokogiri::XML("<text>#{txt}</text>")
+		m.traverse{|t| next unless t.text?;t.text=force_decimal_entities(t.text) if t.text.match(/&[a-z][a-z0-9]+;/i)}
+		m.root.inner_html
 	end
 	HTMLENCODER=HTMLEntities.new
 	def force_decimal_entities(txt)
@@ -88,9 +87,9 @@ class AtomPost
 	def decode_text(txt)
 		return txt if txt.blank?
-		m=Hpricot(txt)
-		m.traverse_text{|t| HTMLENCODER.decode(t.content)}
-		m.to_html
+		m=Nokogiri::XML("<text>#{txt}</text>")
+		m.traverse{|t| next unless t.text?; HTMLENCODER.decode(t.text)}
+		m.root.inner_html
 	end
 end

data/lib/{conf.rb → html2fb/conf.rb} RENAMED Viewed

File without changes

data/lib/{document.rb → html2fb/document.rb} RENAMED Viewed

File without changes

data/lib/{downloader.rb → html2fb/downloader.rb} RENAMED Viewed

File without changes

data/lib/{feedbooks.rb → html2fb/feedbooks.rb} RENAMED Viewed

@@ -1,5 +1,5 @@
-require 'app.rb'
-require 'hpricot'
+require 'html2fb/app.rb'
+require 'nokogiri'
 require 'digest/md5'
 module HTML2FB
@@ -105,11 +105,10 @@ module HTML2FB
 		def to_feedbooks(conf,path=nil)
 			stxt=to_html
 			return unless stxt.strip.size > 0
-			doc=Hpricot('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">'+stxt+'</div>')
-			doc.traverse_all_element do |e|
-				unless e.is_a?Hpricot::Text
+			doc=Nokogiri::XML('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">'+stxt+'</div>')
+			doc.traverse do |e|
+				if e.element?
 					e.name='xhtml:'+e.name
-					e.etag='xhtml:'+e.etag unless (!e.respond_to?:etag) || e.etag.nil?
 				end
 			end
 			FBPost.push(conf,'',doc.to_html,"Text",path)

data/lib/{parser.rb → html2fb/parser.rb} RENAMED Viewed

@@ -1,5 +1,5 @@
-require 'hpricot'
-require 'document.rb'
+require 'nokogiri'
+require 'html2fb/document.rb'
 require 'progressbar'
 #require 'ruby-prof'
 #require 'term/ansicolor'
@@ -14,11 +14,11 @@ module HTML2FB
 		def parse(txt)
 			puts "Parsing HTML"
-			pdoc=Hpricot(txt)
+			pdoc=Nokogiri::HTML(txt)
 			if @conf['conv']
 				mc=pdoc/'meta[@http-equiv="Content-Type"]'
 				if mc.size>0
-					charset=mc.first.attributes['content'].split(';').find do |s|
+					charset=mc.first.attributes['content'].to_s.split(';').find do |s|
 						s.strip[0,7]=='charset'
 					end
 					unless charset.nil?
@@ -28,7 +28,7 @@ module HTML2FB
 					unless tc.nil?
 						puts "Trying to convert source encoding from #{tc} to utf-8"
 						require 'iconv'
-						pdoc=Hpricot(Iconv.conv('utf-8',tc.downcase,txt))
+						pdoc=Nokogiri::HTML(Iconv.conv('utf-8',tc.downcase,txt))
 					end
@@ -38,7 +38,7 @@ module HTML2FB
 			puts "Removing garbage elements"
 			remove_objs(pdoc)
 			ti=pdoc.at('title')
-			doc.title= ti.extract_text.strip unless ti.nil?
+			doc.title= ti.text.strip unless ti.nil?
 			#			pdoc.search('//h3').each do |e|
 			#				doc.content.push(e.inner_text)
 			#			end
@@ -58,10 +58,10 @@ module HTML2FB
 					doc.search('.'+cl).remove
 				end unless @conf['remove']['class'].nil?
 				@conf['remove']['expr'].each do |cl|
-					doc.search(cl).remove
+					doc.search(cl).remove rescue doc.xpath(cl).remove
 				end unless @conf['remove']['expr'].nil?
 				@conf['remove']['before'].each do |cl|
-					x=doc.at(cl)
+					x=doc.at(cl) rescue doc.at_xpath(cl)
 					if x
 						x.preceding.remove
 						x.parent.children.delete(x)
@@ -73,7 +73,7 @@ module HTML2FB
 					t.remove unless t.nil?
 				end unless @conf['remove']['between'].nil?
 				@conf['remove']['after'].each do |cl|
-					x=doc.at(cl)
+					x=doc.at(cl) rescue doc.at_xpath(cl)
 					if x
 						x.following.remove
 						x.parent.children.delete(x)
@@ -89,13 +89,13 @@ module HTML2FB
 			aut=build_autom(@conf['select'],ret)
-			pbar = ProgressBar.new("Parsing", doc.search('//').size)
-			doc.traverse_all_element do |el|
+			pbar = ProgressBar.new("Parsing", doc.search('//*').size)
+			doc.traverse do |el|
 				aut.feed(el)
 				pbar.inc
 			end
-			pbar.finish
 			aut.finish(doc)
+			pbar.finish
 =begin
 			 result = RubyProf.stop
 			  printer = RubyProf::FlatPrinter.new(result)
@@ -180,10 +180,10 @@ module HTML2FB
 			if @content=='body'
 				tmp=el.preceding[0..-1]
 			else
-				tmp=el.root.search(@content...(el.xpath))[1..-1]
+				tmp=el.root.between(@content,(el.path),true)[1..-1]
 			end
 			if tmp.blank? #search can'find between siblins
-				tmp=el.root.deep_between(@content,(el.xpath))
+				tmp=el.root.deep_between(@content,(el.path))
 			end
 			unless tmp.blank?
 				tmph=tmp.to_html
@@ -195,7 +195,7 @@ module HTML2FB
 			(lvl..@max_level).to_a.reverse.each do |l|
 				close_section(l)
 			end
-			@starts[lvl]=create_fbsection(el.root.at(obj[:xpath]).extract_text,obj[:fblevel])
+			@starts[lvl]=create_fbsection(el.root.at_xpath(obj[:xpath]).text,obj[:fblevel])
 			@content=obj[:xpath]
 			@current_level=lvl
 		end
@@ -209,7 +209,7 @@ module HTML2FB
 		end
 		def feed(el)
-			return if el.is_a?Hpricot::Text
+			return if el.text?
 			@done=[[]*@levels.size]
 			@levels.each_with_index do  |lvl,i|
@@ -218,7 +218,7 @@ module HTML2FB
 					if el.in_search?(expr['expr'])
-						open_section({:xpath => el.xpath, :fblevel => expr['fblevel']},i+1,el)
+						open_section({:xpath => el.path, :fblevel => expr['fblevel']},i+1,el)
 						break
 					end
 				end
@@ -228,6 +228,9 @@ module HTML2FB
 	end
 end
+class  Nokogiri::XML::NodeSet
+	alias :blank? :empty?
+end
 class String
 	def blank?
@@ -241,17 +244,22 @@ class NilClass
 	end
 end
-module Hpricot::Traverse
+class Nokogiri::XML::Node
 	def in_search?(expr)
 		if expr !~ /[^a-z0-9]/
 			return self.name.downcase()==expr.downcase()
 		end
-		se_in=self.parent
+		se_in=self.root
+		se_in=self.parent if self.respond_to?(:parent)
 		if expr[0..1]=='/'
 			se_in=self.root
 		end
-		se_in.search(expr).each do |el|
+		set=se_in.search(expr) rescue se_in.xpath(expr)
+		set.each do |el|
 			return true if el==self
 		end
 		#		puts self.name+" "+expr
@@ -259,34 +267,60 @@ module Hpricot::Traverse
 	end
 	def root
-		return @root unless @root.nil?
-		se_in=self
-		se_in=se_in.parent until se_in.parent.nil?
-		@root=se_in
-		se_in
+		self.document.root
 	end
-	def between(a,b)
-		root.search(a..b)
+	def node_position
+		return @node_position if @node_position
+		@node_position=parent.children.index(self)
 	end
-	def extract_text
-		t=''
-		self.traverse_all_element do |e|
-			t+=e.content.to_s if e.is_a?(Hpricot::Text)
+	def between(a,b,excl=false)
+		#from nokogiri
+		offset=(excl ? -1 : 0)
+		ary = []
+		ele1=at(a) rescue at_xpath(a)
+		ele2=at(b) rescue at_xpath(b)
+		if ele1 and ele2
+			# let's quickly take care of siblings
+			if ele1.parent == ele2.parent
+				ary = ele1.parent.children[ele1.node_position..(ele2.node_position+offset)]
+			else
+				# find common parent
+				ele1_p=ele1.ancestors
+				ele2_p=ele2.ancestors
+				common_parent = ele1_p.zip(ele2_p).select { |p1, p2| p1 == p2 }.flatten.first
+				child = nil
+				if ele1 == common_parent
+					child = ele2
+				elsif ele2 == common_parent
+					child = ele1
+				end
+				if child
+					ary = common_parent.children[0..(child.node_position+offset)]
+				end
+			end
 		end
-		t
+		return Nokogiri::XML::NodeSet.new(ele1.document,ary)
 	end
-	def deep_between(i,j)
-		unless j.nil? || self.at(j).nil?
-			tm=self.at(i)
+	def deep_between(i,j)
+		unless j.nil? || self.at_xpath(j).nil?
+			tm=self.at_xpath(i)
 			prec=tm.deep_preceding
-			r=Hpricot::Elements[*self.at(j).deep_preceding.find_all{|el| !(prec.include?el || el==tm)}]
+			r=Nokogiri::XML::NodeSet.new(tm.document,[*self.at(j).deep_preceding.find_all{|el| !(prec.include?el || el==tm)}])
 		else
 			r=self.at(i).deep_following unless self.at(i).nil?
 		end
-		Hpricot::Elements[*select_end(r,i)]
+		Nokogiri::XML::NodeSet.new(self.document,[*select_end(r,i)])
 	end
 	def select_end(tab,expr)
@@ -296,13 +330,15 @@ module Hpricot::Traverse
 		idx=-1
 		i=0
 		tab.each do |e|
-			if e.search(expr.gsub(e.xpath,'.')).size > 0
+			nxp=expr.gsub(e.path,'.')
+			set=e.search(nxp) rescue e.xpath(nxp)
+			if set.size > 0
 				idx=i
 				#if e.search(i).size > 0
-				if e.children.find{|ee| ee.xpath==expr }
+				if e.children.find{|ee| ee.path==expr }
 					e.children.each do |ee|
 						s << ee if f
-						f=true if ee.xpath==expr
+						f=true if ee.path==expr
 					end
 				else
 					s=select_end(e.children,expr)
@@ -316,20 +352,24 @@ module Hpricot::Traverse
 		return s+tab[(idx+1)..-1]
 	end
+	def preceding
+		self.parent.children[0...node_position]
+	end
+	def following
+		self.parent.children[node_position+1..-1]
+	end
 	def deep_preceding()
-		ret=Hpricot::Elements[]
-		ret+=parent.deep_preceding if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
+		ret=Nokogiri::XML::NodeSet.new(self.document,[])
+		ret+=parent.deep_preceding if respond_to?(:parent)  && !parent.is_a?(Nokogiri::XML::Document)
 		ret+=preceding
-		Hpricot::Elements[*ret]
+		ret
 	end
 	def deep_following()
 		ret=following
-		ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
-		Hpricot::Elements[*ret]
+		ret+=parent.deep_following if respond_to?(:parent)  && !parent.is_a?(Nokogiri::XML::Document)
+		ret
 	end
 end
-class Hpricot::Elements
-	alias_method :blank?, :empty?
-end

data/lib/html2fb/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Html2fb
+  VERSION = "1.3.1"
+end

data/lib/html2fb.rb ADDED Viewed

@@ -0,0 +1,7 @@
+require "html2fb/version"
+require 'html2fb/conf.rb'
+require 'html2fb/downloader.rb'
+require 'html2fb/document.rb'
+require 'html2fb/parser.rb'
+require 'html2fb/feedbooks.rb'