zetaben-Html2Feedbooks 0.4.4 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/confs/conf.yaml +5 -5
- data/lib/document.rb +12 -6
- data/lib/feedbooks.rb +4 -5
- data/lib/parser.rb +128 -78
- metadata +13 -2
    
        data/confs/conf.yaml
    CHANGED
    
    
    
        data/lib/document.rb
    CHANGED
    
    | @@ -24,11 +24,11 @@ module HTML2FB | |
| 24 24 | 
             
            		def titles
         | 
| 25 25 | 
             
            			tit=[]
         | 
| 26 26 | 
             
            			content.each do |f|
         | 
| 27 | 
            -
            				if f.is_a?Section
         | 
| 28 | 
            -
            					tit.push f. | 
| 29 | 
            -
            				else
         | 
| 30 | 
            -
            					tit.push '#text'
         | 
| 31 | 
            -
            				end
         | 
| 27 | 
            +
            #				if f.is_a?Section
         | 
| 28 | 
            +
            					tit.push f.titles
         | 
| 29 | 
            +
            #				else
         | 
| 30 | 
            +
            #					tit.push '#text'
         | 
| 31 | 
            +
            #				end
         | 
| 32 32 | 
             
            			end
         | 
| 33 33 |  | 
| 34 34 | 
             
            			return [decorated_title,tit]
         | 
| @@ -42,7 +42,9 @@ module HTML2FB | |
| 42 42 | 
             
            	class Document < Section
         | 
| 43 43 | 
             
            		def toc
         | 
| 44 44 | 
             
            			#return content
         | 
| 45 | 
            -
            			return content.collect{|a| | 
| 45 | 
            +
            			return content.collect{|a|
         | 
| 46 | 
            +
            				a.titles
         | 
| 47 | 
            +
            			}
         | 
| 46 48 | 
             
            		end
         | 
| 47 49 |  | 
| 48 50 | 
             
            	end
         | 
| @@ -61,5 +63,9 @@ module HTML2FB | |
| 61 63 | 
             
            		def to_s
         | 
| 62 64 | 
             
            			@content
         | 
| 63 65 | 
             
            		end
         | 
| 66 | 
            +
             | 
| 67 | 
            +
            		def titles
         | 
| 68 | 
            +
            			return ['#text']
         | 
| 69 | 
            +
            		end
         | 
| 64 70 | 
             
            	end
         | 
| 65 71 | 
             
            end
         | 
    
        data/lib/feedbooks.rb
    CHANGED
    
    | @@ -93,11 +93,10 @@ module HTML2FB | |
| 93 93 |  | 
| 94 94 | 
             
            		def to_html
         | 
| 95 95 | 
             
            			ret=nil
         | 
| 96 | 
            -
            			 | 
| 97 | 
            -
             | 
| 98 | 
            -
            			 | 
| 99 | 
            -
             | 
| 100 | 
            -
            			end
         | 
| 96 | 
            +
            			ret="<h#{@@level+1}>"+title+"</h#{@@level+1}>"
         | 
| 97 | 
            +
            			@@level+=1
         | 
| 98 | 
            +
            			ret+=old_to_html
         | 
| 99 | 
            +
            			@@level-=1
         | 
| 101 100 | 
             
            			ret
         | 
| 102 101 | 
             
            		end
         | 
| 103 102 | 
             
            	end
         | 
    
        data/lib/parser.rb
    CHANGED
    
    | @@ -1,5 +1,8 @@ | |
| 1 1 | 
             
            require 'hpricot'
         | 
| 2 2 | 
             
            require 'document.rb'
         | 
| 3 | 
            +
            require 'progressbar'
         | 
| 4 | 
            +
            #require 'term/ansicolor'
         | 
| 5 | 
            +
            #include Term::ANSIColor
         | 
| 3 6 |  | 
| 4 7 | 
             
            module HTML2FB
         | 
| 5 8 | 
             
            	class Parser
         | 
| @@ -8,14 +11,6 @@ module HTML2FB | |
| 8 11 | 
             
            			@conf=conf
         | 
| 9 12 | 
             
            		end
         | 
| 10 13 |  | 
| 11 | 
            -
            		def extract_text(n)
         | 
| 12 | 
            -
            			t=''
         | 
| 13 | 
            -
            			n.traverse_all_element do |e|
         | 
| 14 | 
            -
            				t+=e.content.to_s if e.is_a?(Hpricot::Text)
         | 
| 15 | 
            -
            			end
         | 
| 16 | 
            -
            			t
         | 
| 17 | 
            -
            		end
         | 
| 18 | 
            -
             | 
| 19 14 | 
             
            		def parse(txt)
         | 
| 20 15 | 
             
            			puts "Parsing HTML"
         | 
| 21 16 | 
             
            			pdoc=Hpricot(txt)
         | 
| @@ -23,7 +18,7 @@ module HTML2FB | |
| 23 18 | 
             
            			puts "Removing garbage elements"
         | 
| 24 19 | 
             
            			remove_objs(pdoc)
         | 
| 25 20 | 
             
            			ti=pdoc.at('title')
         | 
| 26 | 
            -
            			doc.title=  | 
| 21 | 
            +
            			doc.title= ti.extract_text.strip unless ti.nil?
         | 
| 27 22 | 
             
            			#			pdoc.search('//h3').each do |e|
         | 
| 28 23 | 
             
            			#				doc.content.push(e.inner_text)
         | 
| 29 24 | 
             
            			#			end
         | 
| @@ -31,6 +26,8 @@ module HTML2FB | |
| 31 26 | 
             
            			puts "Building TOC"
         | 
| 32 27 | 
             
            			parse_text(pdoc,doc)	
         | 
| 33 28 |  | 
| 29 | 
            +
            #			puts green(bold(doc.pretty_inspect))
         | 
| 30 | 
            +
             | 
| 34 31 | 
             
            			return doc
         | 
| 35 32 | 
             
            		end
         | 
| 36 33 | 
             
            		protected
         | 
| @@ -66,60 +63,119 @@ module HTML2FB | |
| 66 63 | 
             
            		end
         | 
| 67 64 |  | 
| 68 65 | 
             
            		def parse_text(doc,ret)
         | 
| 69 | 
            -
            			 | 
| 70 | 
            -
            			 | 
| 71 | 
            -
             | 
| 72 | 
            -
             | 
| 66 | 
            +
            			aut=build_autom(@conf['select'],ret)
         | 
| 67 | 
            +
            			
         | 
| 68 | 
            +
            			pbar = ProgressBar.new("Parsing", doc.search('//').size)
         | 
| 69 | 
            +
            			doc.traverse_all_element do |el|
         | 
| 70 | 
            +
            			aut.feed(el)
         | 
| 71 | 
            +
            			pbar.inc
         | 
| 73 72 | 
             
            			end
         | 
| 74 | 
            -
            			 | 
| 75 | 
            -
             | 
| 76 | 
            -
             | 
| 77 | 
            -
             | 
| 78 | 
            -
             | 
| 79 | 
            -
             | 
| 80 | 
            -
             | 
| 81 | 
            -
             | 
| 82 | 
            -
             | 
| 83 | 
            -
             | 
| 84 | 
            -
             | 
| 85 | 
            -
             | 
| 73 | 
            +
            			pbar.finish
         | 
| 74 | 
            +
            			aut.finish(doc)
         | 
| 75 | 
            +
            		end
         | 
| 76 | 
            +
             | 
| 77 | 
            +
            		protected
         | 
| 78 | 
            +
             | 
| 79 | 
            +
            		def build_autom(conf_tab,doc)
         | 
| 80 | 
            +
            			mach=StateMachine.new
         | 
| 81 | 
            +
            			build_rec(mach,conf_tab)
         | 
| 82 | 
            +
            			mach.reset(doc)
         | 
| 83 | 
            +
            			mach
         | 
| 84 | 
            +
            		end
         | 
| 85 | 
            +
             | 
| 86 | 
            +
            		def build_rec(mach,conf_tab)
         | 
| 87 | 
            +
            			return if conf_tab.size < 1
         | 
| 88 | 
            +
            			exprs=conf_tab.collect{|e| e.reject{|k,v| k=='select'} }
         | 
| 89 | 
            +
            			mach.add_level(exprs)
         | 
| 90 | 
            +
            			build_rec(mach,conf_tab.collect{|e| e['select'] }.flatten.reject{|a|a.nil?})
         | 
| 91 | 
            +
            		end
         | 
| 92 | 
            +
            	end
         | 
| 93 | 
            +
             | 
| 94 | 
            +
            	class StateMachine
         | 
| 95 | 
            +
             | 
| 96 | 
            +
            		def initialize
         | 
| 97 | 
            +
            			@levels=[]
         | 
| 98 | 
            +
            			@current_level=0
         | 
| 99 | 
            +
            			@starts=[]
         | 
| 100 | 
            +
            			@done=[]
         | 
| 101 | 
            +
            			@max_level=0
         | 
| 102 | 
            +
            			@content=nil
         | 
| 103 | 
            +
            		end
         | 
| 104 | 
            +
             | 
| 105 | 
            +
            		def add_level(tab)
         | 
| 106 | 
            +
            			tab=[tab] unless tab.is_a?Array
         | 
| 107 | 
            +
            			@levels.push tab
         | 
| 108 | 
            +
            			@current_level+=1
         | 
| 109 | 
            +
            		end
         | 
| 110 | 
            +
             | 
| 111 | 
            +
            		def reset(doc)
         | 
| 112 | 
            +
            			@current_level=0
         | 
| 113 | 
            +
            			@max_level=@levels.size
         | 
| 114 | 
            +
            			@starts[0]=doc
         | 
| 115 | 
            +
            			@content='body'
         | 
| 116 | 
            +
            		end
         | 
| 117 | 
            +
             | 
| 118 | 
            +
            		def inspect
         | 
| 119 | 
            +
            			@levels.inspect+"\n"+@current_level.to_s+"\n\n"+@done.inspect
         | 
| 120 | 
            +
            		end
         | 
| 121 | 
            +
             | 
| 122 | 
            +
            		def create_fbsection(title,fblevel)
         | 
| 123 | 
            +
            			s=Section.new
         | 
| 124 | 
            +
            			s.fblevel=fblevel
         | 
| 125 | 
            +
            			s.title = title
         | 
| 126 | 
            +
            			s
         | 
| 127 | 
            +
            		end
         | 
| 128 | 
            +
             | 
| 129 | 
            +
            		def create_textNode(txt)
         | 
| 130 | 
            +
            			Text.new(txt)
         | 
| 131 | 
            +
            		end
         | 
| 132 | 
            +
             | 
| 133 | 
            +
            		def finish(doc)
         | 
| 134 | 
            +
            			unless @content.nil?
         | 
| 135 | 
            +
            			#	t=create_textNode(doc.root.search(@content...doc.children.last.xpath))
         | 
| 136 | 
            +
            				t=create_textNode(doc.at(@content).following.to_html)
         | 
| 137 | 
            +
            				@starts[@current_level].content.push(t)
         | 
| 86 138 | 
             
            			end
         | 
| 139 | 
            +
            			(1..@max_level).to_a.reverse.each do |l|
         | 
| 140 | 
            +
            				close_section(l)
         | 
| 141 | 
            +
            			end
         | 
| 142 | 
            +
            			@starts[0]
         | 
| 143 | 
            +
            		end
         | 
| 87 144 |  | 
| 88 | 
            -
             | 
| 89 | 
            -
             | 
| 90 | 
            -
            				 | 
| 145 | 
            +
            		def open_section(obj,lvl,el)
         | 
| 146 | 
            +
            		#	if @current_level < lvl
         | 
| 147 | 
            +
            				t=create_textNode((el.root.search(@content...(el.xpath))[1..-1].to_html))
         | 
| 148 | 
            +
            				@starts[@current_level].content.push(t)
         | 
| 149 | 
            +
            		#	end
         | 
| 150 | 
            +
            			(lvl..@max_level).to_a.reverse.each do |l|
         | 
| 151 | 
            +
            				close_section(l)
         | 
| 91 152 | 
             
            			end
         | 
| 153 | 
            +
            			@starts[lvl]=create_fbsection(el.root.at(obj[:xpath]).extract_text,obj[:fblevel])
         | 
| 154 | 
            +
            			@content=obj[:xpath]
         | 
| 155 | 
            +
            			@current_level=lvl
         | 
| 92 156 | 
             
            		end
         | 
| 93 157 |  | 
| 94 | 
            -
            		 | 
| 158 | 
            +
            		def close_section(lvl)
         | 
| 159 | 
            +
            			return if @starts[lvl].nil?
         | 
| 160 | 
            +
            			@starts[lvl-1].content.push @starts[lvl]
         | 
| 161 | 
            +
            			@starts[lvl]=nil
         | 
| 162 | 
            +
            		end
         | 
| 163 | 
            +
             | 
| 164 | 
            +
            		def feed(el)
         | 
| 165 | 
            +
            			return if el.is_a?Hpricot::Text
         | 
| 166 | 
            +
            			@done=[[]*@levels.size]
         | 
| 95 167 |  | 
| 96 | 
            -
             | 
| 97 | 
            -
             | 
| 98 | 
            -
             | 
| 99 | 
            -
             | 
| 100 | 
            -
            					if l.is_a?Section
         | 
| 101 | 
            -
            						parse_rec(l,conf['select'])
         | 
| 102 | 
            -
            					else
         | 
| 103 | 
            -
            						doc=Hpricot(l.content)
         | 
| 104 | 
            -
            						ti  = doc.search('//'+conf['expr'])
         | 
| 105 | 
            -
            						return if ti.size ==0
         | 
| 106 | 
            -
            						tit = ti.zip ti[1..-1]+[nil]
         | 
| 107 | 
            -
             | 
| 108 | 
            -
            						tit.each do |a|
         | 
| 109 | 
            -
            							s=Section.new
         | 
| 110 | 
            -
            							s.fblevel=conf['fblevel']
         | 
| 111 | 
            -
            							tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).collect{|r| r.to_original_html}
         | 
| 112 | 
            -
            							
         | 
| 113 | 
            -
            							s.content = [Text.new(tmp.join)]
         | 
| 114 | 
            -
            							s.title = extract_text(a.first)
         | 
| 115 | 
            -
            							el.content.push s
         | 
| 116 | 
            -
            							tmp.each{|t|l.content.sub!(t,'')}
         | 
| 117 | 
            -
            							l.content.sub!(a.first.to_original_html,'')
         | 
| 118 | 
            -
            						end
         | 
| 168 | 
            +
            			@levels.each_with_index do  |lvl,i|
         | 
| 169 | 
            +
            				lvl.each do |expr|
         | 
| 170 | 
            +
            					#puts i.to_s+" "+el.inspect if el.in_search?(expr['expr'])
         | 
| 171 | 
            +
            					if el.in_search?(expr['expr'])
         | 
| 119 172 |  | 
| 173 | 
            +
             | 
| 174 | 
            +
            						open_section({:xpath => el.xpath, :fblevel => expr['fblevel']},i+1,el)
         | 
| 120 175 | 
             
            					end
         | 
| 121 176 | 
             
            				end
         | 
| 122 177 | 
             
            			end
         | 
| 178 | 
            +
             | 
| 123 179 | 
             
            		end
         | 
| 124 180 | 
             
            	end
         | 
| 125 181 | 
             
            end
         | 
| @@ -138,36 +194,30 @@ class NilClass | |
| 138 194 | 
             
            end
         | 
| 139 195 |  | 
| 140 196 | 
             
            module Hpricot::Traverse
         | 
| 141 | 
            -
            	def  | 
| 142 | 
            -
            		 | 
| 143 | 
            -
            		 | 
| 144 | 
            -
             | 
| 145 | 
            -
            			Hpricot::Elements[*self.at(j).deep_preceding.find_all{|el| !prec.include?el}]
         | 
| 146 | 
            -
            		else
         | 
| 147 | 
            -
            			self.at(i).deep_following unless self.at(i).nil?
         | 
| 197 | 
            +
            	def in_search?(expr)
         | 
| 198 | 
            +
            		se_in=self.parent
         | 
| 199 | 
            +
            		if expr[0..1]=='/'
         | 
| 200 | 
            +
            		se_in=se_in.parent until se_in.parent.nil?
         | 
| 148 201 | 
             
            		end
         | 
| 202 | 
            +
            		se_in.search(expr).each do |el|
         | 
| 203 | 
            +
            			return true if el==self
         | 
| 204 | 
            +
            		end
         | 
| 205 | 
            +
            #		puts self.name+" "+expr
         | 
| 206 | 
            +
            		return false
         | 
| 149 207 | 
             
            	end
         | 
| 150 208 |  | 
| 151 | 
            -
            	def  | 
| 152 | 
            -
             | 
| 153 | 
            -
             | 
| 154 | 
            -
             | 
| 155 | 
            -
            	Hpricot::Elements[*ret]
         | 
| 156 | 
            -
            	end
         | 
| 157 | 
            -
            	def deep_following()
         | 
| 158 | 
            -
            	ret=following
         | 
| 159 | 
            -
            	ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
         | 
| 160 | 
            -
            	Hpricot::Elements[*ret]
         | 
| 161 | 
            -
            	end
         | 
| 162 | 
            -
            end
         | 
| 163 | 
            -
             | 
| 164 | 
            -
             | 
| 165 | 
            -
            class Hpricot::Elements
         | 
| 166 | 
            -
            	def between(i,j)
         | 
| 167 | 
            -
            		Hpricot::Elements[*self.collect{|a| a.between(i,j)}]
         | 
| 209 | 
            +
            	def root
         | 
| 210 | 
            +
            		se_in=self
         | 
| 211 | 
            +
            		se_in=se_in.parent until se_in.parent.nil?
         | 
| 212 | 
            +
            		se_in
         | 
| 168 213 | 
             
            	end
         | 
| 169 214 |  | 
| 170 | 
            -
            	def  | 
| 171 | 
            -
            		 | 
| 215 | 
            +
            	def extract_text
         | 
| 216 | 
            +
            		t=''
         | 
| 217 | 
            +
            		self.traverse_all_element do |e|
         | 
| 218 | 
            +
            			t+=e.content.to_s if e.is_a?(Hpricot::Text)
         | 
| 219 | 
            +
            		end
         | 
| 220 | 
            +
            		t
         | 
| 172 221 | 
             
            	end
         | 
| 173 222 | 
             
            end
         | 
| 223 | 
            +
             | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification 
         | 
| 2 2 | 
             
            name: zetaben-Html2Feedbooks
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 4 | 
            -
              version: 0. | 
| 4 | 
            +
              version: 1.0.0
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors: 
         | 
| 7 7 | 
             
            - Benoit Larroque
         | 
| @@ -42,6 +42,16 @@ dependencies: | |
| 42 42 | 
             
                  - !ruby/object:Gem::Version 
         | 
| 43 43 | 
             
                    version: "0.3"
         | 
| 44 44 | 
             
                version: 
         | 
| 45 | 
            +
            - !ruby/object:Gem::Dependency 
         | 
| 46 | 
            +
              name: progressbar
         | 
| 47 | 
            +
              type: :runtime
         | 
| 48 | 
            +
              version_requirement: 
         | 
| 49 | 
            +
              version_requirements: !ruby/object:Gem::Requirement 
         | 
| 50 | 
            +
                requirements: 
         | 
| 51 | 
            +
                - - ">="
         | 
| 52 | 
            +
                  - !ruby/object:Gem::Version 
         | 
| 53 | 
            +
                    version: 0.0.3
         | 
| 54 | 
            +
                version: 
         | 
| 45 55 | 
             
            description: Html2Feedbooks is script to automate basic publishing on feedbooks.com
         | 
| 46 56 | 
             
            email: zeta dot ben at gmail dot com
         | 
| 47 57 | 
             
            executables: 
         | 
| @@ -62,6 +72,7 @@ files: | |
| 62 72 | 
             
            - lib/parser.rb
         | 
| 63 73 | 
             
            has_rdoc: true
         | 
| 64 74 | 
             
            homepage: http://github.com/Html2Feedbooks
         | 
| 75 | 
            +
            licenses: 
         | 
| 65 76 | 
             
            post_install_message: 
         | 
| 66 77 | 
             
            rdoc_options: []
         | 
| 67 78 |  | 
| @@ -82,7 +93,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 82 93 | 
             
            requirements: []
         | 
| 83 94 |  | 
| 84 95 | 
             
            rubyforge_project: 
         | 
| 85 | 
            -
            rubygems_version: 1. | 
| 96 | 
            +
            rubygems_version: 1.3.5
         | 
| 86 97 | 
             
            signing_key: 
         | 
| 87 98 | 
             
            specification_version: 2
         | 
| 88 99 | 
             
            summary: Html2Feedbooks is script to automate basic publishing on feedbooks.com
         |