zetaben-Html2Feedbooks 0.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +3 -1
 - data/bin/html2fb.rb +23 -1
 - data/lib/app.rb +14 -2
 - data/lib/conf.rb +1 -1
 - data/lib/feedbooks.rb +1 -0
 - data/lib/parser.rb +36 -8
 - metadata +11 -1
 
    
        data/README
    CHANGED
    
    
    
        data/bin/html2fb.rb
    CHANGED
    
    | 
         @@ -1,4 +1,5 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            #!/usr/bin/ruby
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'optparse'
         
     | 
| 
       2 
3 
     | 
    
         
             
            require 'open-uri'
         
     | 
| 
       3 
4 
     | 
    
         
             
            require 'conf.rb'
         
     | 
| 
       4 
5 
     | 
    
         
             
            require 'downloader.rb'
         
     | 
| 
         @@ -8,6 +9,20 @@ require 'feedbooks.rb' 
     | 
|
| 
       8 
9 
     | 
    
         | 
| 
       9 
10 
     | 
    
         
             
            include HTML2FB
         
     | 
| 
       10 
11 
     | 
    
         | 
| 
      
 12 
     | 
    
         
            +
            options = {}
         
     | 
| 
      
 13 
     | 
    
         
            +
            options[:conf] = "conf.yaml"
         
     | 
| 
      
 14 
     | 
    
         
            +
            options[:preview] = true
         
     | 
| 
      
 15 
     | 
    
         
            +
            OptionParser.new do |opts|
         
     | 
| 
      
 16 
     | 
    
         
            +
            	opts.banner = "Usage: html2fb [options] URL"
         
     | 
| 
      
 17 
     | 
    
         
            +
             
     | 
| 
      
 18 
     | 
    
         
            +
            	opts.on("-c", "--conf FILE", String,"Configuration file") do |f|
         
     | 
| 
      
 19 
     | 
    
         
            +
            		options[:conf] = f
         
     | 
| 
      
 20 
     | 
    
         
            +
            	end
         
     | 
| 
      
 21 
     | 
    
         
            +
            	opts.on("-s", "-s","Send to feedbooks") do |f|
         
     | 
| 
      
 22 
     | 
    
         
            +
            		options[:preview] = !f
         
     | 
| 
      
 23 
     | 
    
         
            +
            	end
         
     | 
| 
      
 24 
     | 
    
         
            +
            end.parse!
         
     | 
| 
      
 25 
     | 
    
         
            +
             
     | 
| 
       11 
26 
     | 
    
         
             
            valid=false
         
     | 
| 
       12 
27 
     | 
    
         
             
            entry=ARGV[0]
         
     | 
| 
       13 
28 
     | 
    
         
             
            while !valid
         
     | 
| 
         @@ -23,9 +38,16 @@ while !valid 
     | 
|
| 
       23 
38 
     | 
    
         
             
            	print "URL : " if entry.nil? || entry==''
         
     | 
| 
       24 
39 
     | 
    
         
             
            	entry=STDIN.readline.strip unless valid
         
     | 
| 
       25 
40 
     | 
    
         
             
            end
         
     | 
| 
       26 
     | 
    
         
            -
            conf=Conf.new( 
     | 
| 
      
 41 
     | 
    
         
            +
            conf=Conf.new(options[:conf])
         
     | 
| 
       27 
42 
     | 
    
         
             
            content=Downloader.download(url)
         
     | 
| 
       28 
43 
     | 
    
         
             
            #puts content.size
         
     | 
| 
       29 
44 
     | 
    
         
             
            doc=Parser.new(conf).parse(content)
         
     | 
| 
       30 
45 
     | 
    
         
             
            puts doc.toc.to_yaml
         
     | 
| 
      
 46 
     | 
    
         
            +
            if options[:preview]
         
     | 
| 
      
 47 
     | 
    
         
            +
            	f=File.open('/tmp/plop.html','w')
         
     | 
| 
      
 48 
     | 
    
         
            +
            	f.write doc.to_html
         
     | 
| 
      
 49 
     | 
    
         
            +
            	f.close
         
     | 
| 
      
 50 
     | 
    
         
            +
            	`firefox /tmp/plop.html`
         
     | 
| 
      
 51 
     | 
    
         
            +
            else
         
     | 
| 
       31 
52 
     | 
    
         
             
            doc.to_feedbooks(conf)
         
     | 
| 
      
 53 
     | 
    
         
            +
            end
         
     | 
    
        data/lib/app.rb
    CHANGED
    
    | 
         @@ -2,6 +2,7 @@ require 'digest/md5' 
     | 
|
| 
       2 
2 
     | 
    
         
             
            require 'open-uri'
         
     | 
| 
       3 
3 
     | 
    
         
             
            require 'net/http'
         
     | 
| 
       4 
4 
     | 
    
         
             
            require 'time'
         
     | 
| 
      
 5 
     | 
    
         
            +
            require 'htmlentities'
         
     | 
| 
       5 
6 
     | 
    
         | 
| 
       6 
7 
     | 
    
         
             
            class AtomPost
         
     | 
| 
       7 
8 
     | 
    
         
             
            	attr_accessor :title
         
     | 
| 
         @@ -25,11 +26,11 @@ class AtomPost 
     | 
|
| 
       25 
26 
     | 
    
         | 
| 
       26 
27 
     | 
    
         
             
            		req.body  = '<?xml version="1.0"?>'+"\n"
         
     | 
| 
       27 
28 
     | 
    
         
             
            		req.body  +='<entry xmlns="http://www.w3.org/2005/Atom">'+"\n"
         
     | 
| 
       28 
     | 
    
         
            -
            		req.body  +='<title>'+title+'</title>'+"\n"
         
     | 
| 
      
 29 
     | 
    
         
            +
            		req.body  +='<title>'+recode_text(title)+'</title>'+"\n"
         
     | 
| 
       29 
30 
     | 
    
         
             
            		req.body  +='<id>'+Digest::MD5.hexdigest(title+content)+'</id>'+"\n"
         
     | 
| 
       30 
31 
     | 
    
         
             
            		req.body  +='<updated>'+date.xmlschema+'</updated>'+"\n"
         
     | 
| 
       31 
32 
     | 
    
         
             
            		req.body  +='<author><name>'+author+'</name></author>'+"\n"
         
     | 
| 
       32 
     | 
    
         
            -
            		req.body  +='<content>'+content+'</content>'+"\n"
         
     | 
| 
      
 33 
     | 
    
         
            +
            		req.body  +='<content>'+recode_text(content)+'</content>'+"\n"
         
     | 
| 
       33 
34 
     | 
    
         
             
            		req.body  +='</entry>'+"\n"
         
     | 
| 
       34 
35 
     | 
    
         | 
| 
       35 
36 
     | 
    
         
             
            		req.set_content_type('application/atom+xml;type=entry')
         
     | 
| 
         @@ -46,4 +47,15 @@ class AtomPost 
     | 
|
| 
       46 
47 
     | 
    
         
             
            			res.error!
         
     | 
| 
       47 
48 
     | 
    
         
             
            		end
         
     | 
| 
       48 
49 
     | 
    
         
             
            	end
         
     | 
| 
      
 50 
     | 
    
         
            +
             
     | 
| 
      
 51 
     | 
    
         
            +
            	def recode_text(txt)
         
     | 
| 
      
 52 
     | 
    
         
            +
            		return txt if txt.blank?
         
     | 
| 
      
 53 
     | 
    
         
            +
            		m=Hpricot(txt)
         
     | 
| 
      
 54 
     | 
    
         
            +
            		m.traverse_text{|t| t.content=force_decimal_entities(t.content) if t.content.match(/&[a-z][a-z0-9]+;/i)}
         
     | 
| 
      
 55 
     | 
    
         
            +
            		m.to_html
         
     | 
| 
      
 56 
     | 
    
         
            +
            	end
         
     | 
| 
      
 57 
     | 
    
         
            +
            	HTMLENCODER=HTMLEntities.new
         
     | 
| 
      
 58 
     | 
    
         
            +
            	def force_decimal_entities(txt)
         
     | 
| 
      
 59 
     | 
    
         
            +
            		HTMLENCODER.encode(HTMLENCODER.decode(txt),:decimal)
         
     | 
| 
      
 60 
     | 
    
         
            +
            	end
         
     | 
| 
       49 
61 
     | 
    
         
             
            end
         
     | 
    
        data/lib/conf.rb
    CHANGED
    
    | 
         @@ -2,7 +2,7 @@ 
     | 
|
| 
       2 
2 
     | 
    
         
             
            module HTML2FB
         
     | 
| 
       3 
3 
     | 
    
         
             
            	class Conf
         
     | 
| 
       4 
4 
     | 
    
         
             
            		def initialize(file)
         
     | 
| 
       5 
     | 
    
         
            -
            			['./',"#{File.dirname(__FILE__)}/","#{File.dirname(__FILE__)}/../confs/"].each do |p|
         
     | 
| 
      
 5 
     | 
    
         
            +
            			['','./',"#{File.dirname(__FILE__)}/","#{File.dirname(__FILE__)}/../confs/"].each do |p|
         
     | 
| 
       6 
6 
     | 
    
         
             
            				f=p+file
         
     | 
| 
       7 
7 
     | 
    
         
             
            				begin
         
     | 
| 
       8 
8 
     | 
    
         
             
            					if File.readable?(f) && File.exists?(f)
         
     | 
    
        data/lib/feedbooks.rb
    CHANGED
    
    | 
         @@ -54,6 +54,7 @@ module HTML2FB 
     | 
|
| 
       54 
54 
     | 
    
         
             
            	class Section
         
     | 
| 
       55 
55 
     | 
    
         
             
            		@@level=0
         
     | 
| 
       56 
56 
     | 
    
         
             
            		def to_feedbooks(conf)
         
     | 
| 
      
 57 
     | 
    
         
            +
            			puts "Sending to feedbooks"
         
     | 
| 
       57 
58 
     | 
    
         
             
            			fb=FBSession.session
         
     | 
| 
       58 
59 
     | 
    
         
             
            			post=AtomPost.new "http://#{fb.host}/#{fb.booktype}/#{fb.bookid}/contents.atom"
         
     | 
| 
       59 
60 
     | 
    
         
             
            			doc=Hpricot('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">'+to_html+'</div>')
         
     | 
    
        data/lib/parser.rb
    CHANGED
    
    | 
         @@ -8,16 +8,27 @@ module HTML2FB 
     | 
|
| 
       8 
8 
     | 
    
         
             
            			@conf=conf
         
     | 
| 
       9 
9 
     | 
    
         
             
            		end
         
     | 
| 
       10 
10 
     | 
    
         | 
| 
      
 11 
     | 
    
         
            +
            		def extract_text(n)
         
     | 
| 
      
 12 
     | 
    
         
            +
            			t=''
         
     | 
| 
      
 13 
     | 
    
         
            +
            			n.traverse_all_element do |e|
         
     | 
| 
      
 14 
     | 
    
         
            +
            				t+=e.content.to_s if e.is_a?(Hpricot::Text)
         
     | 
| 
      
 15 
     | 
    
         
            +
            			end
         
     | 
| 
      
 16 
     | 
    
         
            +
            			t
         
     | 
| 
      
 17 
     | 
    
         
            +
            		end
         
     | 
| 
      
 18 
     | 
    
         
            +
             
     | 
| 
       11 
19 
     | 
    
         
             
            		def parse(txt)
         
     | 
| 
      
 20 
     | 
    
         
            +
            			puts "Parsing HTML"
         
     | 
| 
       12 
21 
     | 
    
         
             
            			pdoc=Hpricot(txt)
         
     | 
| 
       13 
22 
     | 
    
         
             
            			doc=Document.new
         
     | 
| 
      
 23 
     | 
    
         
            +
            			puts "Removing garbage elements"
         
     | 
| 
       14 
24 
     | 
    
         
             
            			remove_objs(pdoc)
         
     | 
| 
       15 
25 
     | 
    
         
             
            			ti=pdoc.at('title')
         
     | 
| 
       16 
     | 
    
         
            -
            			doc.title= ti. 
     | 
| 
      
 26 
     | 
    
         
            +
            			doc.title= extract_text(ti).strip unless ti.nil?
         
     | 
| 
       17 
27 
     | 
    
         
             
            			#			pdoc.search('//h3').each do |e|
         
     | 
| 
       18 
28 
     | 
    
         
             
            			#				doc.content.push(e.inner_text)
         
     | 
| 
       19 
29 
     | 
    
         
             
            			#			end
         
     | 
| 
       20 
30 
     | 
    
         | 
| 
      
 31 
     | 
    
         
            +
            			puts "Building TOC"
         
     | 
| 
       21 
32 
     | 
    
         
             
            			parse_text(pdoc,doc)	
         
     | 
| 
       22 
33 
     | 
    
         | 
| 
       23 
34 
     | 
    
         
             
            			return doc
         
     | 
| 
         @@ -34,8 +45,10 @@ module HTML2FB 
     | 
|
| 
       34 
45 
     | 
    
         
             
            				end unless @conf['remove']['expr'].nil?
         
     | 
| 
       35 
46 
     | 
    
         
             
            				@conf['remove']['before'].each do |cl|
         
     | 
| 
       36 
47 
     | 
    
         
             
            					x=doc.at(cl)
         
     | 
| 
      
 48 
     | 
    
         
            +
            					if x
         
     | 
| 
       37 
49 
     | 
    
         
             
            					x.preceding.remove
         
     | 
| 
       38 
50 
     | 
    
         
             
            					x.parent.children.delete(x)
         
     | 
| 
      
 51 
     | 
    
         
            +
            					end
         
     | 
| 
       39 
52 
     | 
    
         
             
            				end unless @conf['remove']['before'].nil?
         
     | 
| 
       40 
53 
     | 
    
         
             
            				@conf['remove']['between'].each do |cl|
         
     | 
| 
       41 
54 
     | 
    
         
             
            #					puts "between "+cl.inspect
         
     | 
| 
         @@ -43,11 +56,13 @@ module HTML2FB 
     | 
|
| 
       43 
56 
     | 
    
         
             
            				end unless @conf['remove']['between'].nil?
         
     | 
| 
       44 
57 
     | 
    
         
             
            				@conf['remove']['after'].each do |cl|
         
     | 
| 
       45 
58 
     | 
    
         
             
            					x=doc.at(cl)
         
     | 
| 
      
 59 
     | 
    
         
            +
            					if x
         
     | 
| 
       46 
60 
     | 
    
         
             
            					x.following.remove
         
     | 
| 
       47 
61 
     | 
    
         
             
            					x.parent.children.delete(x)
         
     | 
| 
      
 62 
     | 
    
         
            +
            					end
         
     | 
| 
       48 
63 
     | 
    
         
             
            				end unless @conf['remove']['after'].nil?
         
     | 
| 
       49 
64 
     | 
    
         
             
            			end
         
     | 
| 
       50 
     | 
    
         
            -
            			File.open('/tmp/test.html','w'){|f| f.write doc.to_html}
         
     | 
| 
      
 65 
     | 
    
         
            +
            #			File.open('/tmp/test.html','w'){|f| f.write doc.to_html}
         
     | 
| 
       51 
66 
     | 
    
         
             
            		end
         
     | 
| 
       52 
67 
     | 
    
         | 
| 
       53 
68 
     | 
    
         
             
            		def parse_text(doc,ret)
         
     | 
| 
         @@ -59,7 +74,8 @@ module HTML2FB 
     | 
|
| 
       59 
74 
     | 
    
         
             
            				tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).to_html
         
     | 
| 
       60 
75 
     | 
    
         
             
            				tmp.sub!(a.first.to_original_html,'')
         
     | 
| 
       61 
76 
     | 
    
         
             
            				s.content =[Text.new(tmp)]
         
     | 
| 
       62 
     | 
    
         
            -
            				 
     | 
| 
      
 77 
     | 
    
         
            +
            				#buggy with entities
         
     | 
| 
      
 78 
     | 
    
         
            +
            				s.title = extract_text(a.first)
         
     | 
| 
       63 
79 
     | 
    
         
             
            				ret.content.push s
         
     | 
| 
       64 
80 
     | 
    
         | 
| 
       65 
81 
     | 
    
         
             
            			end
         
     | 
| 
         @@ -88,7 +104,7 @@ module HTML2FB 
     | 
|
| 
       88 
104 
     | 
    
         
             
            							s=Section.new
         
     | 
| 
       89 
105 
     | 
    
         
             
            							tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).to_html
         
     | 
| 
       90 
106 
     | 
    
         
             
            							s.content = [Text.new(tmp)]
         
     | 
| 
       91 
     | 
    
         
            -
            							s.title = a.first 
     | 
| 
      
 107 
     | 
    
         
            +
            							s.title = extract_text(a.first)
         
     | 
| 
       92 
108 
     | 
    
         
             
            							el.content.push s
         
     | 
| 
       93 
109 
     | 
    
         
             
            							l.content.sub!(tmp,'')
         
     | 
| 
       94 
110 
     | 
    
         
             
            							l.content.sub!(a.first.to_original_html,'')
         
     | 
| 
         @@ -117,13 +133,25 @@ end 
     | 
|
| 
       117 
133 
     | 
    
         
             
            module Hpricot::Traverse
         
     | 
| 
       118 
134 
     | 
    
         
             
            	def between(i,j)
         
     | 
| 
       119 
135 
     | 
    
         
             
            		#puts i,j
         
     | 
| 
       120 
     | 
    
         
            -
            		unless j.nil?
         
     | 
| 
       121 
     | 
    
         
            -
            			prec=self.at(i). 
     | 
| 
       122 
     | 
    
         
            -
            			Hpricot::Elements[*self.at(j). 
     | 
| 
      
 136 
     | 
    
         
            +
            		unless j.nil? || self.at(j).nil?
         
     | 
| 
      
 137 
     | 
    
         
            +
            			prec=self.at(i).deep_preceding
         
     | 
| 
      
 138 
     | 
    
         
            +
            			Hpricot::Elements[*self.at(j).deep_preceding.find_all{|el| !prec.include?el}]
         
     | 
| 
       123 
139 
     | 
    
         
             
            		else
         
     | 
| 
       124 
     | 
    
         
            -
            			self.at(i). 
     | 
| 
      
 140 
     | 
    
         
            +
            			self.at(i).deep_following unless self.at(i).nil?
         
     | 
| 
       125 
141 
     | 
    
         
             
            		end
         
     | 
| 
       126 
142 
     | 
    
         
             
            	end
         
     | 
| 
      
 143 
     | 
    
         
            +
             
     | 
| 
      
 144 
     | 
    
         
            +
            	def deep_preceding()
         
     | 
| 
      
 145 
     | 
    
         
            +
            	ret=Hpricot::Elements[]
         
     | 
| 
      
 146 
     | 
    
         
            +
            	ret+=parent.deep_preceding if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
         
     | 
| 
      
 147 
     | 
    
         
            +
            	ret+=preceding
         
     | 
| 
      
 148 
     | 
    
         
            +
            	Hpricot::Elements[*ret]
         
     | 
| 
      
 149 
     | 
    
         
            +
            	end
         
     | 
| 
      
 150 
     | 
    
         
            +
            	def deep_following()
         
     | 
| 
      
 151 
     | 
    
         
            +
            	ret=following
         
     | 
| 
      
 152 
     | 
    
         
            +
            	ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
         
     | 
| 
      
 153 
     | 
    
         
            +
            	Hpricot::Elements[*ret]
         
     | 
| 
      
 154 
     | 
    
         
            +
            	end
         
     | 
| 
       127 
155 
     | 
    
         
             
            end
         
     | 
| 
       128 
156 
     | 
    
         | 
| 
       129 
157 
     | 
    
         | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification 
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: zetaben-Html2Feedbooks
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version 
         
     | 
| 
       4 
     | 
    
         
            -
              version: "0.1"
     | 
| 
      
 4 
     | 
    
         
            +
              version: "0.2"
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors: 
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Benoit Larroque
         
     | 
| 
         @@ -22,6 +22,16 @@ dependencies: 
     | 
|
| 
       22 
22 
     | 
    
         
             
                  - !ruby/object:Gem::Version 
         
     | 
| 
       23 
23 
     | 
    
         
             
                    version: "0.6"
         
     | 
| 
       24 
24 
     | 
    
         
             
                version: 
         
     | 
| 
      
 25 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency 
         
     | 
| 
      
 26 
     | 
    
         
            +
              name: htmlentities
         
     | 
| 
      
 27 
     | 
    
         
            +
              type: :runtime
         
     | 
| 
      
 28 
     | 
    
         
            +
              version_requirement: 
         
     | 
| 
      
 29 
     | 
    
         
            +
              version_requirements: !ruby/object:Gem::Requirement 
         
     | 
| 
      
 30 
     | 
    
         
            +
                requirements: 
         
     | 
| 
      
 31 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 32 
     | 
    
         
            +
                  - !ruby/object:Gem::Version 
         
     | 
| 
      
 33 
     | 
    
         
            +
                    version: "4.0"
         
     | 
| 
      
 34 
     | 
    
         
            +
                version: 
         
     | 
| 
       25 
35 
     | 
    
         
             
            description: Html2Feedbooks is script to automate basic publishing on feedbooks.com
         
     | 
| 
       26 
36 
     | 
    
         
             
            email: zeta dot ben at gmail dot com
         
     | 
| 
       27 
37 
     | 
    
         
             
            executables: 
         
     |