zetaben-Html2Feedbooks 0.4.4 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -11,11 +11,11 @@ remove:
11
11
  - hr
12
12
 
13
13
  select:
14
- expr: h3
15
- fblevel: Chapter
16
- select:
17
- expr: h4
18
- fblevel: Section
14
+ - expr: h2
15
+ fblevel: Part
16
+ select:
17
+ - expr: h3
18
+ fblevel: Chapter
19
19
 
20
20
  fb:
21
21
  user: #ask#
@@ -24,11 +24,11 @@ module HTML2FB
24
24
  def titles
25
25
  tit=[]
26
26
  content.each do |f|
27
- if f.is_a?Section
28
- tit.push f.decorated_title
29
- else
30
- tit.push '#text'
31
- end
27
+ # if f.is_a?Section
28
+ tit.push f.titles
29
+ # else
30
+ # tit.push '#text'
31
+ # end
32
32
  end
33
33
 
34
34
  return [decorated_title,tit]
@@ -42,7 +42,9 @@ module HTML2FB
42
42
  class Document < Section
43
43
  def toc
44
44
  #return content
45
- return content.collect{|a|a.titles}
45
+ return content.collect{|a|
46
+ a.titles
47
+ }
46
48
  end
47
49
 
48
50
  end
@@ -61,5 +63,9 @@ module HTML2FB
61
63
  def to_s
62
64
  @content
63
65
  end
66
+
67
+ def titles
68
+ return ['#text']
69
+ end
64
70
  end
65
71
  end
@@ -93,11 +93,10 @@ module HTML2FB
93
93
 
94
94
  def to_html
95
95
  ret=nil
96
- if @@level==1
97
- ret=old_to_html
98
- else
99
- ret="<h#{@@level+1}>"+title+"</h#{@@level+1}>"+old_to_html
100
- end
96
+ ret="<h#{@@level+1}>"+title+"</h#{@@level+1}>"
97
+ @@level+=1
98
+ ret+=old_to_html
99
+ @@level-=1
101
100
  ret
102
101
  end
103
102
  end
@@ -1,5 +1,8 @@
1
1
  require 'hpricot'
2
2
  require 'document.rb'
3
+ require 'progressbar'
4
+ #require 'term/ansicolor'
5
+ #include Term::ANSIColor
3
6
 
4
7
  module HTML2FB
5
8
  class Parser
@@ -8,14 +11,6 @@ module HTML2FB
8
11
  @conf=conf
9
12
  end
10
13
 
11
- def extract_text(n)
12
- t=''
13
- n.traverse_all_element do |e|
14
- t+=e.content.to_s if e.is_a?(Hpricot::Text)
15
- end
16
- t
17
- end
18
-
19
14
  def parse(txt)
20
15
  puts "Parsing HTML"
21
16
  pdoc=Hpricot(txt)
@@ -23,7 +18,7 @@ module HTML2FB
23
18
  puts "Removing garbage elements"
24
19
  remove_objs(pdoc)
25
20
  ti=pdoc.at('title')
26
- doc.title= extract_text(ti).strip unless ti.nil?
21
+ doc.title= ti.extract_text.strip unless ti.nil?
27
22
  # pdoc.search('//h3').each do |e|
28
23
  # doc.content.push(e.inner_text)
29
24
  # end
@@ -31,6 +26,8 @@ module HTML2FB
31
26
  puts "Building TOC"
32
27
  parse_text(pdoc,doc)
33
28
 
29
+ # puts green(bold(doc.pretty_inspect))
30
+
34
31
  return doc
35
32
  end
36
33
  protected
@@ -66,60 +63,119 @@ module HTML2FB
66
63
  end
67
64
 
68
65
  def parse_text(doc,ret)
69
- ti = doc.search('//'+@conf['select']['expr'])
70
- if ti.nil?
71
- STDERR.puts "No #{@conf['select']['expr']} found"
72
- return
66
+ aut=build_autom(@conf['select'],ret)
67
+
68
+ pbar = ProgressBar.new("Parsing", doc.search('//').size)
69
+ doc.traverse_all_element do |el|
70
+ aut.feed(el)
71
+ pbar.inc
73
72
  end
74
- tit = ti.zip ti[1..-1]+[nil]
75
-
76
- tit.each do |a|
77
- s=Section.new
78
- s.fblevel=@conf['select']['fblevel']
79
- tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).collect{|r| r.to_original_html}.join
80
- tmp.sub!(a.first.to_original_html,'')
81
- s.content =[Text.new(tmp)]
82
- #buggy with entities
83
- s.title = extract_text(a.first)
84
- ret.content.push s
85
-
73
+ pbar.finish
74
+ aut.finish(doc)
75
+ end
76
+
77
+ protected
78
+
79
+ def build_autom(conf_tab,doc)
80
+ mach=StateMachine.new
81
+ build_rec(mach,conf_tab)
82
+ mach.reset(doc)
83
+ mach
84
+ end
85
+
86
+ def build_rec(mach,conf_tab)
87
+ return if conf_tab.size < 1
88
+ exprs=conf_tab.collect{|e| e.reject{|k,v| k=='select'} }
89
+ mach.add_level(exprs)
90
+ build_rec(mach,conf_tab.collect{|e| e['select'] }.flatten.reject{|a|a.nil?})
91
+ end
92
+ end
93
+
94
+ class StateMachine
95
+
96
+ def initialize
97
+ @levels=[]
98
+ @current_level=0
99
+ @starts=[]
100
+ @done=[]
101
+ @max_level=0
102
+ @content=nil
103
+ end
104
+
105
+ def add_level(tab)
106
+ tab=[tab] unless tab.is_a?Array
107
+ @levels.push tab
108
+ @current_level+=1
109
+ end
110
+
111
+ def reset(doc)
112
+ @current_level=0
113
+ @max_level=@levels.size
114
+ @starts[0]=doc
115
+ @content='body'
116
+ end
117
+
118
+ def inspect
119
+ @levels.inspect+"\n"+@current_level.to_s+"\n\n"+@done.inspect
120
+ end
121
+
122
+ def create_fbsection(title,fblevel)
123
+ s=Section.new
124
+ s.fblevel=fblevel
125
+ s.title = title
126
+ s
127
+ end
128
+
129
+ def create_textNode(txt)
130
+ Text.new(txt)
131
+ end
132
+
133
+ def finish(doc)
134
+ unless @content.nil?
135
+ # t=create_textNode(doc.root.search(@content...doc.children.last.xpath))
136
+ t=create_textNode(doc.at(@content).following.to_html)
137
+ @starts[@current_level].content.push(t)
86
138
  end
139
+ (1..@max_level).to_a.reverse.each do |l|
140
+ close_section(l)
141
+ end
142
+ @starts[0]
143
+ end
87
144
 
88
- if @conf['select']['select']
89
- conf=@conf['select']
90
- parse_rec(ret,conf)
145
+ def open_section(obj,lvl,el)
146
+ # if @current_level < lvl
147
+ t=create_textNode((el.root.search(@content...(el.xpath))[1..-1].to_html))
148
+ @starts[@current_level].content.push(t)
149
+ # end
150
+ (lvl..@max_level).to_a.reverse.each do |l|
151
+ close_section(l)
91
152
  end
153
+ @starts[lvl]=create_fbsection(el.root.at(obj[:xpath]).extract_text,obj[:fblevel])
154
+ @content=obj[:xpath]
155
+ @current_level=lvl
92
156
  end
93
157
 
94
- protected
158
+ def close_section(lvl)
159
+ return if @starts[lvl].nil?
160
+ @starts[lvl-1].content.push @starts[lvl]
161
+ @starts[lvl]=nil
162
+ end
163
+
164
+ def feed(el)
165
+ return if el.is_a?Hpricot::Text
166
+ @done=[[]*@levels.size]
95
167
 
96
- def parse_rec(el,conf)
97
- return if conf.nil?
98
- if el.is_a?Section
99
- el.content.each do |l|
100
- if l.is_a?Section
101
- parse_rec(l,conf['select'])
102
- else
103
- doc=Hpricot(l.content)
104
- ti = doc.search('//'+conf['expr'])
105
- return if ti.size ==0
106
- tit = ti.zip ti[1..-1]+[nil]
107
-
108
- tit.each do |a|
109
- s=Section.new
110
- s.fblevel=conf['fblevel']
111
- tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).collect{|r| r.to_original_html}
112
-
113
- s.content = [Text.new(tmp.join)]
114
- s.title = extract_text(a.first)
115
- el.content.push s
116
- tmp.each{|t|l.content.sub!(t,'')}
117
- l.content.sub!(a.first.to_original_html,'')
118
- end
168
+ @levels.each_with_index do |lvl,i|
169
+ lvl.each do |expr|
170
+ #puts i.to_s+" "+el.inspect if el.in_search?(expr['expr'])
171
+ if el.in_search?(expr['expr'])
119
172
 
173
+
174
+ open_section({:xpath => el.xpath, :fblevel => expr['fblevel']},i+1,el)
120
175
  end
121
176
  end
122
177
  end
178
+
123
179
  end
124
180
  end
125
181
  end
@@ -138,36 +194,30 @@ class NilClass
138
194
  end
139
195
 
140
196
  module Hpricot::Traverse
141
- def between(i,j)
142
- #puts i,j
143
- unless j.nil? || self.at(j).nil?
144
- prec=self.at(i).deep_preceding
145
- Hpricot::Elements[*self.at(j).deep_preceding.find_all{|el| !prec.include?el}]
146
- else
147
- self.at(i).deep_following unless self.at(i).nil?
197
+ def in_search?(expr)
198
+ se_in=self.parent
199
+ if expr[0..1]=='/'
200
+ se_in=se_in.parent until se_in.parent.nil?
148
201
  end
202
+ se_in.search(expr).each do |el|
203
+ return true if el==self
204
+ end
205
+ # puts self.name+" "+expr
206
+ return false
149
207
  end
150
208
 
151
- def deep_preceding()
152
- ret=Hpricot::Elements[]
153
- ret+=parent.deep_preceding if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
154
- ret+=preceding
155
- Hpricot::Elements[*ret]
156
- end
157
- def deep_following()
158
- ret=following
159
- ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
160
- Hpricot::Elements[*ret]
161
- end
162
- end
163
-
164
-
165
- class Hpricot::Elements
166
- def between(i,j)
167
- Hpricot::Elements[*self.collect{|a| a.between(i,j)}]
209
+ def root
210
+ se_in=self
211
+ se_in=se_in.parent until se_in.parent.nil?
212
+ se_in
168
213
  end
169
214
 
170
- def -(a)
171
- Hpricot::Elements[*self.find_all{|el| !a.include?el}]
215
+ def extract_text
216
+ t=''
217
+ self.traverse_all_element do |e|
218
+ t+=e.content.to_s if e.is_a?(Hpricot::Text)
219
+ end
220
+ t
172
221
  end
173
222
  end
223
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: zetaben-Html2Feedbooks
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.4
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Benoit Larroque
@@ -42,6 +42,16 @@ dependencies:
42
42
  - !ruby/object:Gem::Version
43
43
  version: "0.3"
44
44
  version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: progressbar
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 0.0.3
54
+ version:
45
55
  description: Html2Feedbooks is script to automate basic publishing on feedbooks.com
46
56
  email: zeta dot ben at gmail dot com
47
57
  executables:
@@ -62,6 +72,7 @@ files:
62
72
  - lib/parser.rb
63
73
  has_rdoc: true
64
74
  homepage: http://github.com/Html2Feedbooks
75
+ licenses:
65
76
  post_install_message:
66
77
  rdoc_options: []
67
78
 
@@ -82,7 +93,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
82
93
  requirements: []
83
94
 
84
95
  rubyforge_project:
85
- rubygems_version: 1.2.0
96
+ rubygems_version: 1.3.5
86
97
  signing_key:
87
98
  specification_version: 2
88
99
  summary: Html2Feedbooks is script to automate basic publishing on feedbooks.com