RubyGems - zetaben-Html2Feedbooks - Versions diffs - 0.1 → 0.2 - Mend

zetaben-Html2Feedbooks 0.1 → 0.2

Files changed (7)

data/README CHANGED Viewed

@@ -8,4 +8,6 @@ Usage
 ./html2fb URL.html
-You can change settings in confs/conf.yaml
+You can change some settings by creating your own configuration file and using
+html2fb -c myconf.yaml URL.html

data/bin/html2fb.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 #!/usr/bin/ruby
+require 'optparse'
 require 'open-uri'
 require 'conf.rb'
 require 'downloader.rb'
@@ -8,6 +9,20 @@ require 'feedbooks.rb'
 include HTML2FB
+options = {}
+options[:conf] = "conf.yaml"
+options[:preview] = true
+OptionParser.new do |opts|
+	opts.banner = "Usage: html2fb [options] URL"
+	opts.on("-c", "--conf FILE", String,"Configuration file") do |f|
+		options[:conf] = f
+	end
+	opts.on("-s", "-s","Send to feedbooks") do |f|
+		options[:preview] = !f
+	end
+end.parse!
 valid=false
 entry=ARGV[0]
 while !valid
@@ -23,9 +38,16 @@ while !valid
 	print "URL : " if entry.nil? || entry==''
 	entry=STDIN.readline.strip unless valid
 end
-conf=Conf.new('conf.yaml')
+conf=Conf.new(options[:conf])
 content=Downloader.download(url)
 #puts content.size
 doc=Parser.new(conf).parse(content)
 puts doc.toc.to_yaml
+if options[:preview]
+	f=File.open('/tmp/plop.html','w')
+	f.write doc.to_html
+	f.close
+	`firefox /tmp/plop.html`
+else
 doc.to_feedbooks(conf)
+end

data/lib/app.rb CHANGED Viewed

@@ -2,6 +2,7 @@ require 'digest/md5'
 require 'open-uri'
 require 'net/http'
 require 'time'
+require 'htmlentities'
 class AtomPost
 	attr_accessor :title
@@ -25,11 +26,11 @@ class AtomPost
 		req.body  = '<?xml version="1.0"?>'+"\n"
 		req.body  +='<entry xmlns="http://www.w3.org/2005/Atom">'+"\n"
-		req.body  +='<title>'+title+'</title>'+"\n"
+		req.body  +='<title>'+recode_text(title)+'</title>'+"\n"
 		req.body  +='<id>'+Digest::MD5.hexdigest(title+content)+'</id>'+"\n"
 		req.body  +='<updated>'+date.xmlschema+'</updated>'+"\n"
 		req.body  +='<author><name>'+author+'</name></author>'+"\n"
-		req.body  +='<content>'+content+'</content>'+"\n"
+		req.body  +='<content>'+recode_text(content)+'</content>'+"\n"
 		req.body  +='</entry>'+"\n"
 		req.set_content_type('application/atom+xml;type=entry')
@@ -46,4 +47,15 @@ class AtomPost
 			res.error!
 		end
 	end
+	def recode_text(txt)
+		return txt if txt.blank?
+		m=Hpricot(txt)
+		m.traverse_text{|t| t.content=force_decimal_entities(t.content) if t.content.match(/&[a-z][a-z0-9]+;/i)}
+		m.to_html
+	end
+	HTMLENCODER=HTMLEntities.new
+	def force_decimal_entities(txt)
+		HTMLENCODER.encode(HTMLENCODER.decode(txt),:decimal)
+	end
 end

data/lib/conf.rb CHANGED Viewed

@@ -2,7 +2,7 @@
 module HTML2FB
 	class Conf
 		def initialize(file)
-			['./',"#{File.dirname(__FILE__)}/","#{File.dirname(__FILE__)}/../confs/"].each do |p|
+			['','./',"#{File.dirname(__FILE__)}/","#{File.dirname(__FILE__)}/../confs/"].each do |p|
 				f=p+file
 				begin
 					if File.readable?(f) && File.exists?(f)

data/lib/feedbooks.rb CHANGED Viewed

@@ -54,6 +54,7 @@ module HTML2FB
 	class Section
 		@@level=0
 		def to_feedbooks(conf)
+			puts "Sending to feedbooks"
 			fb=FBSession.session
 			post=AtomPost.new "http://#{fb.host}/#{fb.booktype}/#{fb.bookid}/contents.atom"
 			doc=Hpricot('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">'+to_html+'</div>')

data/lib/parser.rb CHANGED Viewed

@@ -8,16 +8,27 @@ module HTML2FB
 			@conf=conf
 		end
+		def extract_text(n)
+			t=''
+			n.traverse_all_element do |e|
+				t+=e.content.to_s if e.is_a?(Hpricot::Text)
+			end
+			t
+		end
 		def parse(txt)
+			puts "Parsing HTML"
 			pdoc=Hpricot(txt)
 			doc=Document.new
+			puts "Removing garbage elements"
 			remove_objs(pdoc)
 			ti=pdoc.at('title')
-			doc.title= ti.inner_text.strip unless ti.nil?
+			doc.title= extract_text(ti).strip unless ti.nil?
 			#			pdoc.search('//h3').each do |e|
 			#				doc.content.push(e.inner_text)
 			#			end
+			puts "Building TOC"
 			parse_text(pdoc,doc)
 			return doc
@@ -34,8 +45,10 @@ module HTML2FB
 				end unless @conf['remove']['expr'].nil?
 				@conf['remove']['before'].each do |cl|
 					x=doc.at(cl)
+					if x
 					x.preceding.remove
 					x.parent.children.delete(x)
+					end
 				end unless @conf['remove']['before'].nil?
 				@conf['remove']['between'].each do |cl|
 #					puts "between "+cl.inspect
@@ -43,11 +56,13 @@ module HTML2FB
 				end unless @conf['remove']['between'].nil?
 				@conf['remove']['after'].each do |cl|
 					x=doc.at(cl)
+					if x
 					x.following.remove
 					x.parent.children.delete(x)
+					end
 				end unless @conf['remove']['after'].nil?
 			end
-			File.open('/tmp/test.html','w'){|f| f.write doc.to_html}
+#			File.open('/tmp/test.html','w'){|f| f.write doc.to_html}
 		end
 		def parse_text(doc,ret)
@@ -59,7 +74,8 @@ module HTML2FB
 				tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).to_html
 				tmp.sub!(a.first.to_original_html,'')
 				s.content =[Text.new(tmp)]
-				s.title = a.first.inner_text.to_s
+				#buggy with entities
+				s.title = extract_text(a.first)
 				ret.content.push s
 			end
@@ -88,7 +104,7 @@ module HTML2FB
 							s=Section.new
 							tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).to_html
 							s.content = [Text.new(tmp)]
-							s.title = a.first.inner_text.to_s
+							s.title = extract_text(a.first)
 							el.content.push s
 							l.content.sub!(tmp,'')
 							l.content.sub!(a.first.to_original_html,'')
@@ -117,13 +133,25 @@ end
 module Hpricot::Traverse
 	def between(i,j)
 		#puts i,j
-		unless j.nil?
-			prec=self.at(i).preceding
-			Hpricot::Elements[*self.at(j).preceding.find_all{|el| !prec.include?el}]
+		unless j.nil? || self.at(j).nil?
+			prec=self.at(i).deep_preceding
+			Hpricot::Elements[*self.at(j).deep_preceding.find_all{|el| !prec.include?el}]
 		else
-			self.at(i).following
+			self.at(i).deep_following unless self.at(i).nil?
 		end
 	end
+	def deep_preceding()
+	ret=Hpricot::Elements[]
+	ret+=parent.deep_preceding if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
+	ret+=preceding
+	Hpricot::Elements[*ret]
+	end
+	def deep_following()
+	ret=following
+	ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
+	Hpricot::Elements[*ret]
+	end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: zetaben-Html2Feedbooks
 version: !ruby/object:Gem::Version
-  version: "0.1"
+  version: "0.2"
 platform: ruby
 authors:
 - Benoit Larroque
@@ -22,6 +22,16 @@ dependencies:
       - !ruby/object:Gem::Version
         version: "0.6"
     version:
+- !ruby/object:Gem::Dependency
+  name: htmlentities
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "4.0"
+    version:
 description: Html2Feedbooks is script to automate basic publishing on feedbooks.com
 email: zeta dot ben at gmail dot com
 executables: