RubyGems - Html2Feedbooks - Versions diffs - 1.1.1 → 1.3.1 - Mend

Html2Feedbooks 1.1.1 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

data/.gitignore +4 -0
data/Gemfile +4 -0
data/Rakefile +1 -0
data/bin/html2fb.rb +4 -7
data/html2fb.gemspec +27 -0
data/lib/{app.rb → html2fb/app.rb} +8 -9
data/lib/{conf.rb → html2fb/conf.rb} +0 -0
data/lib/{document.rb → html2fb/document.rb} +0 -0
data/lib/{downloader.rb → html2fb/downloader.rb} +0 -0
data/lib/{feedbooks.rb → html2fb/feedbooks.rb} +5 -6
data/lib/{parser.rb → html2fb/parser.rb} +89 -49
data/lib/html2fb/version.rb +3 -0
data/lib/html2fb.rb +7 -0
data/samples/107-h.htm +19642 -0
data/samples/107-h2.htm +1259 -0
data/samples/3049-h.htm +7787 -0
data/samples/3058-h.htm +8732 -0
data/samples/3258-h.htm +19894 -0
data/samples/3258-h2.htm +686 -0
data/samples/3469-h.htm +14024 -0
data/samples/conf107-h.yml +27 -0
data/samples/conf3049-h.yml +27 -0
data/samples/conf3058-h.yml +27 -0
data/samples/conf3258-h.yml +26 -0
data/samples/conf3469-h.yml +32 -0
metadata +41 -21

data/.gitignore ADDED Viewed

@@ -0,0 +1,4 @@
+*.gem
+.bundle
+Gemfile.lock
+pkg/*

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+source "http://rubygems.org"
+# Specify your gem's dependencies in html2fb.gemspec
+gemspec

data/Rakefile ADDED Viewed

@@ -0,0 +1 @@
+require "bundler/gem_tasks"

data/bin/html2fb.rb CHANGED Viewed

@@ -1,14 +1,11 @@
-#!/usr/bin/ruby
+#!/usr/bin/env ruby
+require 'rubygems'
 require 'optparse'
 require 'open-uri'
-require 'conf.rb'
-require 'downloader.rb'
-require 'document.rb'
-require 'parser.rb'
-require 'feedbooks.rb'
 require 'tmpdir'
 require 'launchy'
 require 'digest/md5'
+require 'lib/html2fb'
 include HTML2FB
@@ -86,7 +83,7 @@ if options[:preview]
 	puts "A preview of the parsed file should be opening in your webbrowser now"
 	puts "If nothing open you can open the file located at : #{page}"
 	puts "When happy with the parsed output rerun with -s option to send to Feedbooks.com"
-	Launchy::Browser.run(page)
+	Launchy.open(page)
 else
 	doc.to_feedbooks(conf)
 end

data/html2fb.gemspec ADDED Viewed

@@ -0,0 +1,27 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "html2fb/version"
+Gem::Specification.new do |s|
+  s.name        = "Html2Feedbooks"
+  s.version     = Html2fb::VERSION
+  s.authors = ["Benoit Larroque"]
+  s.email = ["benoit dot larroque at feedbooks dot com"]
+  s.summary = %q{Html2Feedbooks is script to automate basic publishing on feedbooks.com}
+  s.homepage = %q{http://github.com/zetaben/Html2Feedbooks}
+  s.description = %q{Html2Feedbooks is script to automate basic publishing on feedbooks.com}
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+  s.default_executable = 'html2fb.rb'
+  s.add_dependency('nokogiri','>=1.4.1')
+  s.add_dependency('htmlentities', '>= 4.2.1')
+  s.add_dependency('launchy', '>= 2.0.0')
+  s.add_dependency('progressbar', '>= 0.0.3')
+  # specify any dependencies here; for example:
+  # s.add_development_dependency "rspec"
+  # s.add_runtime_dependency "rest-client"
+end

data/lib/{app.rb → html2fb/app.rb} RENAMED Viewed

@@ -34,8 +34,8 @@ class AtomPost
 			req = Net::HTTP::Get.new(url.path)
 			req.basic_auth user,pass  unless user.nil?
 			response = http.request(req)
-			doc=Hpricot(response.body)
-			e=doc.at('//entry').at('link[@rel="down"]')
+			doc=Nokogiri::XML(response.body).remove_namespaces!
+			e=doc.at('//entry/link[@rel="down"]')
 			return 	URI.parse(e[:href]).path unless e.nil?
 		}
 	end
@@ -47,7 +47,6 @@ class AtomPost
 		#STDERR.puts "sending to #{url}"
 		req = Net::HTTP::Post.new(url.path)
 		req.basic_auth user,pass  unless user.nil?
 		req.body  = '<?xml version="1.0"?>'+"\n"
 		req.body  +='<entry xmlns="http://www.w3.org/2005/Atom">'+"\n"
 		req.body  +='<title>'+decode_text(title)+'</title>'+"\n"
@@ -77,9 +76,9 @@ class AtomPost
 	def recode_text(txt)
 		return txt if txt.blank?
-		m=Hpricot(txt)
-		m.traverse_text{|t| t.content=force_decimal_entities(t.content) if t.content.match(/&[a-z][a-z0-9]+;/i)}
-		m.to_html
+		m=Nokogiri::XML("<text>#{txt}</text>")
+		m.traverse{|t| next unless t.text?;t.text=force_decimal_entities(t.text) if t.text.match(/&[a-z][a-z0-9]+;/i)}
+		m.root.inner_html
 	end
 	HTMLENCODER=HTMLEntities.new
 	def force_decimal_entities(txt)
@@ -88,9 +87,9 @@ class AtomPost
 	def decode_text(txt)
 		return txt if txt.blank?
-		m=Hpricot(txt)
-		m.traverse_text{|t| HTMLENCODER.decode(t.content)}
-		m.to_html
+		m=Nokogiri::XML("<text>#{txt}</text>")
+		m.traverse{|t| next unless t.text?; HTMLENCODER.decode(t.text)}
+		m.root.inner_html
 	end
 end

data/lib/{conf.rb → html2fb/conf.rb} RENAMED Viewed

File without changes

data/lib/{document.rb → html2fb/document.rb} RENAMED Viewed

File without changes

data/lib/{downloader.rb → html2fb/downloader.rb} RENAMED Viewed

File without changes

data/lib/{feedbooks.rb → html2fb/feedbooks.rb} RENAMED Viewed

@@ -1,5 +1,5 @@
-require 'app.rb'
-require 'hpricot'
+require 'html2fb/app.rb'
+require 'nokogiri'
 require 'digest/md5'
 module HTML2FB
@@ -105,11 +105,10 @@ module HTML2FB
 		def to_feedbooks(conf,path=nil)
 			stxt=to_html
 			return unless stxt.strip.size > 0
-			doc=Hpricot('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">'+stxt+'</div>')
-			doc.traverse_all_element do |e|
-				unless e.is_a?Hpricot::Text
+			doc=Nokogiri::XML('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">'+stxt+'</div>')
+			doc.traverse do |e|
+				if e.element?
 					e.name='xhtml:'+e.name
-					e.etag='xhtml:'+e.etag unless (!e.respond_to?:etag) || e.etag.nil?
 				end
 			end
 			FBPost.push(conf,'',doc.to_html,"Text",path)

data/lib/{parser.rb → html2fb/parser.rb} RENAMED Viewed

@@ -1,5 +1,5 @@
-require 'hpricot'
-require 'document.rb'
+require 'nokogiri'
+require 'html2fb/document.rb'
 require 'progressbar'
 #require 'ruby-prof'
 #require 'term/ansicolor'
@@ -14,11 +14,11 @@ module HTML2FB
 		def parse(txt)
 			puts "Parsing HTML"
-			pdoc=Hpricot(txt)
+			pdoc=Nokogiri::HTML(txt)
 			if @conf['conv']
 				mc=pdoc/'meta[@http-equiv="Content-Type"]'
 				if mc.size>0
-					charset=mc.first.attributes['content'].split(';').find do |s|
+					charset=mc.first.attributes['content'].to_s.split(';').find do |s|
 						s.strip[0,7]=='charset'
 					end
 					unless charset.nil?
@@ -28,7 +28,7 @@ module HTML2FB
 					unless tc.nil?
 						puts "Trying to convert source encoding from #{tc} to utf-8"
 						require 'iconv'
-						pdoc=Hpricot(Iconv.conv('utf-8',tc.downcase,txt))
+						pdoc=Nokogiri::HTML(Iconv.conv('utf-8',tc.downcase,txt))
 					end
@@ -38,7 +38,7 @@ module HTML2FB
 			puts "Removing garbage elements"
 			remove_objs(pdoc)
 			ti=pdoc.at('title')
-			doc.title= ti.extract_text.strip unless ti.nil?
+			doc.title= ti.text.strip unless ti.nil?
 			#			pdoc.search('//h3').each do |e|
 			#				doc.content.push(e.inner_text)
 			#			end
@@ -58,10 +58,10 @@ module HTML2FB
 					doc.search('.'+cl).remove
 				end unless @conf['remove']['class'].nil?
 				@conf['remove']['expr'].each do |cl|
-					doc.search(cl).remove
+					doc.search(cl).remove rescue doc.xpath(cl).remove
 				end unless @conf['remove']['expr'].nil?
 				@conf['remove']['before'].each do |cl|
-					x=doc.at(cl)
+					x=doc.at(cl) rescue doc.at_xpath(cl)
 					if x
 						x.preceding.remove
 						x.parent.children.delete(x)
@@ -73,7 +73,7 @@ module HTML2FB
 					t.remove unless t.nil?
 				end unless @conf['remove']['between'].nil?
 				@conf['remove']['after'].each do |cl|
-					x=doc.at(cl)
+					x=doc.at(cl) rescue doc.at_xpath(cl)
 					if x
 						x.following.remove
 						x.parent.children.delete(x)
@@ -89,13 +89,13 @@ module HTML2FB
 			aut=build_autom(@conf['select'],ret)
-			pbar = ProgressBar.new("Parsing", doc.search('//').size)
-			doc.traverse_all_element do |el|
+			pbar = ProgressBar.new("Parsing", doc.search('//*').size)
+			doc.traverse do |el|
 				aut.feed(el)
 				pbar.inc
 			end
-			pbar.finish
 			aut.finish(doc)
+			pbar.finish
 =begin
 			 result = RubyProf.stop
 			  printer = RubyProf::FlatPrinter.new(result)
@@ -180,10 +180,10 @@ module HTML2FB
 			if @content=='body'
 				tmp=el.preceding[0..-1]
 			else
-				tmp=el.root.search(@content...(el.xpath))[1..-1]
+				tmp=el.root.between(@content,(el.path),true)[1..-1]
 			end
 			if tmp.blank? #search can'find between siblins
-				tmp=el.root.deep_between(@content,(el.xpath))
+				tmp=el.root.deep_between(@content,(el.path))
 			end
 			unless tmp.blank?
 				tmph=tmp.to_html
@@ -195,7 +195,7 @@ module HTML2FB
 			(lvl..@max_level).to_a.reverse.each do |l|
 				close_section(l)
 			end
-			@starts[lvl]=create_fbsection(el.root.at(obj[:xpath]).extract_text,obj[:fblevel])
+			@starts[lvl]=create_fbsection(el.root.at_xpath(obj[:xpath]).text,obj[:fblevel])
 			@content=obj[:xpath]
 			@current_level=lvl
 		end
@@ -209,7 +209,7 @@ module HTML2FB
 		end
 		def feed(el)
-			return if el.is_a?Hpricot::Text
+			return if el.text?
 			@done=[[]*@levels.size]
 			@levels.each_with_index do  |lvl,i|
@@ -218,7 +218,7 @@ module HTML2FB
 					if el.in_search?(expr['expr'])
-						open_section({:xpath => el.xpath, :fblevel => expr['fblevel']},i+1,el)
+						open_section({:xpath => el.path, :fblevel => expr['fblevel']},i+1,el)
 						break
 					end
 				end
@@ -228,6 +228,9 @@ module HTML2FB
 	end
 end
+class  Nokogiri::XML::NodeSet
+	alias :blank? :empty?
+end
 class String
 	def blank?
@@ -241,17 +244,22 @@ class NilClass
 	end
 end
-module Hpricot::Traverse
+class Nokogiri::XML::Node
 	def in_search?(expr)
 		if expr !~ /[^a-z0-9]/
 			return self.name.downcase()==expr.downcase()
 		end
-		se_in=self.parent
+		se_in=self.root
+		se_in=self.parent if self.respond_to?(:parent)
 		if expr[0..1]=='/'
 			se_in=self.root
 		end
-		se_in.search(expr).each do |el|
+		set=se_in.search(expr) rescue se_in.xpath(expr)
+		set.each do |el|
 			return true if el==self
 		end
 		#		puts self.name+" "+expr
@@ -259,34 +267,60 @@ module Hpricot::Traverse
 	end
 	def root
-		return @root unless @root.nil?
-		se_in=self
-		se_in=se_in.parent until se_in.parent.nil?
-		@root=se_in
-		se_in
+		self.document.root
 	end
-	def between(a,b)
-		root.search(a..b)
+	def node_position
+		return @node_position if @node_position
+		@node_position=parent.children.index(self)
 	end
-	def extract_text
-		t=''
-		self.traverse_all_element do |e|
-			t+=e.content.to_s if e.is_a?(Hpricot::Text)
+	def between(a,b,excl=false)
+		#from nokogiri
+		offset=(excl ? -1 : 0)
+		ary = []
+		ele1=at(a) rescue at_xpath(a)
+		ele2=at(b) rescue at_xpath(b)
+		if ele1 and ele2
+			# let's quickly take care of siblings
+			if ele1.parent == ele2.parent
+				ary = ele1.parent.children[ele1.node_position..(ele2.node_position+offset)]
+			else
+				# find common parent
+				ele1_p=ele1.ancestors
+				ele2_p=ele2.ancestors
+				common_parent = ele1_p.zip(ele2_p).select { |p1, p2| p1 == p2 }.flatten.first
+				child = nil
+				if ele1 == common_parent
+					child = ele2
+				elsif ele2 == common_parent
+					child = ele1
+				end
+				if child
+					ary = common_parent.children[0..(child.node_position+offset)]
+				end
+			end
 		end
-		t
+		return Nokogiri::XML::NodeSet.new(ele1.document,ary)
 	end
-	def deep_between(i,j)
-		unless j.nil? || self.at(j).nil?
-			tm=self.at(i)
+	def deep_between(i,j)
+		unless j.nil? || self.at_xpath(j).nil?
+			tm=self.at_xpath(i)
 			prec=tm.deep_preceding
-			r=Hpricot::Elements[*self.at(j).deep_preceding.find_all{|el| !(prec.include?el || el==tm)}]
+			r=Nokogiri::XML::NodeSet.new(tm.document,[*self.at(j).deep_preceding.find_all{|el| !(prec.include?el || el==tm)}])
 		else
 			r=self.at(i).deep_following unless self.at(i).nil?
 		end
-		Hpricot::Elements[*select_end(r,i)]
+		Nokogiri::XML::NodeSet.new(self.document,[*select_end(r,i)])
 	end
 	def select_end(tab,expr)
@@ -296,13 +330,15 @@ module Hpricot::Traverse
 		idx=-1
 		i=0
 		tab.each do |e|
-			if e.search(expr.gsub(e.xpath,'.')).size > 0
+			nxp=expr.gsub(e.path,'.')
+			set=e.search(nxp) rescue e.xpath(nxp)
+			if set.size > 0
 				idx=i
 				#if e.search(i).size > 0
-				if e.children.find{|ee| ee.xpath==expr }
+				if e.children.find{|ee| ee.path==expr }
 					e.children.each do |ee|
 						s << ee if f
-						f=true if ee.xpath==expr
+						f=true if ee.path==expr
 					end
 				else
 					s=select_end(e.children,expr)
@@ -316,20 +352,24 @@ module Hpricot::Traverse
 		return s+tab[(idx+1)..-1]
 	end
+	def preceding
+		self.parent.children[0...node_position]
+	end
+	def following
+		self.parent.children[node_position+1..-1]
+	end
 	def deep_preceding()
-		ret=Hpricot::Elements[]
-		ret+=parent.deep_preceding if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
+		ret=Nokogiri::XML::NodeSet.new(self.document,[])
+		ret+=parent.deep_preceding if respond_to?(:parent)  && !parent.is_a?(Nokogiri::XML::Document)
 		ret+=preceding
-		Hpricot::Elements[*ret]
+		ret
 	end
 	def deep_following()
 		ret=following
-		ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
-		Hpricot::Elements[*ret]
+		ret+=parent.deep_following if respond_to?(:parent)  && !parent.is_a?(Nokogiri::XML::Document)
+		ret
 	end
 end
-class Hpricot::Elements
-	alias_method :blank?, :empty?
-end

data/lib/html2fb/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Html2fb
+  VERSION = "1.3.1"
+end

data/lib/html2fb.rb ADDED Viewed

@@ -0,0 +1,7 @@
+require "html2fb/version"
+require 'html2fb/conf.rb'
+require 'html2fb/downloader.rb'
+require 'html2fb/document.rb'
+require 'html2fb/parser.rb'
+require 'html2fb/feedbooks.rb'