zetaben-Html2Feedbooks 0.4.4 → 1.0.0

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -11,11 +11,11 @@ remove:
11
11
  - hr
12
12
 
13
13
  select:
14
- expr: h3
15
- fblevel: Chapter
16
- select:
17
- expr: h4
18
- fblevel: Section
14
+ - expr: h2
15
+ fblevel: Part
16
+ select:
17
+ - expr: h3
18
+ fblevel: Chapter
19
19
 
20
20
  fb:
21
21
  user: #ask#
@@ -24,11 +24,11 @@ module HTML2FB
24
24
  def titles
25
25
  tit=[]
26
26
  content.each do |f|
27
- if f.is_a?Section
28
- tit.push f.decorated_title
29
- else
30
- tit.push '#text'
31
- end
27
+ # if f.is_a?Section
28
+ tit.push f.titles
29
+ # else
30
+ # tit.push '#text'
31
+ # end
32
32
  end
33
33
 
34
34
  return [decorated_title,tit]
@@ -42,7 +42,9 @@ module HTML2FB
42
42
  class Document < Section
43
43
  def toc
44
44
  #return content
45
- return content.collect{|a|a.titles}
45
+ return content.collect{|a|
46
+ a.titles
47
+ }
46
48
  end
47
49
 
48
50
  end
@@ -61,5 +63,9 @@ module HTML2FB
61
63
  def to_s
62
64
  @content
63
65
  end
66
+
67
+ def titles
68
+ return ['#text']
69
+ end
64
70
  end
65
71
  end
@@ -93,11 +93,10 @@ module HTML2FB
93
93
 
94
94
  def to_html
95
95
  ret=nil
96
- if @@level==1
97
- ret=old_to_html
98
- else
99
- ret="<h#{@@level+1}>"+title+"</h#{@@level+1}>"+old_to_html
100
- end
96
+ ret="<h#{@@level+1}>"+title+"</h#{@@level+1}>"
97
+ @@level+=1
98
+ ret+=old_to_html
99
+ @@level-=1
101
100
  ret
102
101
  end
103
102
  end
@@ -1,5 +1,8 @@
1
1
  require 'hpricot'
2
2
  require 'document.rb'
3
+ require 'progressbar'
4
+ #require 'term/ansicolor'
5
+ #include Term::ANSIColor
3
6
 
4
7
  module HTML2FB
5
8
  class Parser
@@ -8,14 +11,6 @@ module HTML2FB
8
11
  @conf=conf
9
12
  end
10
13
 
11
- def extract_text(n)
12
- t=''
13
- n.traverse_all_element do |e|
14
- t+=e.content.to_s if e.is_a?(Hpricot::Text)
15
- end
16
- t
17
- end
18
-
19
14
  def parse(txt)
20
15
  puts "Parsing HTML"
21
16
  pdoc=Hpricot(txt)
@@ -23,7 +18,7 @@ module HTML2FB
23
18
  puts "Removing garbage elements"
24
19
  remove_objs(pdoc)
25
20
  ti=pdoc.at('title')
26
- doc.title= extract_text(ti).strip unless ti.nil?
21
+ doc.title= ti.extract_text.strip unless ti.nil?
27
22
  # pdoc.search('//h3').each do |e|
28
23
  # doc.content.push(e.inner_text)
29
24
  # end
@@ -31,6 +26,8 @@ module HTML2FB
31
26
  puts "Building TOC"
32
27
  parse_text(pdoc,doc)
33
28
 
29
+ # puts green(bold(doc.pretty_inspect))
30
+
34
31
  return doc
35
32
  end
36
33
  protected
@@ -66,60 +63,119 @@ module HTML2FB
66
63
  end
67
64
 
68
65
  def parse_text(doc,ret)
69
- ti = doc.search('//'+@conf['select']['expr'])
70
- if ti.nil?
71
- STDERR.puts "No #{@conf['select']['expr']} found"
72
- return
66
+ aut=build_autom(@conf['select'],ret)
67
+
68
+ pbar = ProgressBar.new("Parsing", doc.search('//').size)
69
+ doc.traverse_all_element do |el|
70
+ aut.feed(el)
71
+ pbar.inc
73
72
  end
74
- tit = ti.zip ti[1..-1]+[nil]
75
-
76
- tit.each do |a|
77
- s=Section.new
78
- s.fblevel=@conf['select']['fblevel']
79
- tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).collect{|r| r.to_original_html}.join
80
- tmp.sub!(a.first.to_original_html,'')
81
- s.content =[Text.new(tmp)]
82
- #buggy with entities
83
- s.title = extract_text(a.first)
84
- ret.content.push s
85
-
73
+ pbar.finish
74
+ aut.finish(doc)
75
+ end
76
+
77
+ protected
78
+
79
+ def build_autom(conf_tab,doc)
80
+ mach=StateMachine.new
81
+ build_rec(mach,conf_tab)
82
+ mach.reset(doc)
83
+ mach
84
+ end
85
+
86
+ def build_rec(mach,conf_tab)
87
+ return if conf_tab.size < 1
88
+ exprs=conf_tab.collect{|e| e.reject{|k,v| k=='select'} }
89
+ mach.add_level(exprs)
90
+ build_rec(mach,conf_tab.collect{|e| e['select'] }.flatten.reject{|a|a.nil?})
91
+ end
92
+ end
93
+
94
+ class StateMachine
95
+
96
+ def initialize
97
+ @levels=[]
98
+ @current_level=0
99
+ @starts=[]
100
+ @done=[]
101
+ @max_level=0
102
+ @content=nil
103
+ end
104
+
105
+ def add_level(tab)
106
+ tab=[tab] unless tab.is_a?Array
107
+ @levels.push tab
108
+ @current_level+=1
109
+ end
110
+
111
+ def reset(doc)
112
+ @current_level=0
113
+ @max_level=@levels.size
114
+ @starts[0]=doc
115
+ @content='body'
116
+ end
117
+
118
+ def inspect
119
+ @levels.inspect+"\n"+@current_level.to_s+"\n\n"+@done.inspect
120
+ end
121
+
122
+ def create_fbsection(title,fblevel)
123
+ s=Section.new
124
+ s.fblevel=fblevel
125
+ s.title = title
126
+ s
127
+ end
128
+
129
+ def create_textNode(txt)
130
+ Text.new(txt)
131
+ end
132
+
133
+ def finish(doc)
134
+ unless @content.nil?
135
+ # t=create_textNode(doc.root.search(@content...doc.children.last.xpath))
136
+ t=create_textNode(doc.at(@content).following.to_html)
137
+ @starts[@current_level].content.push(t)
86
138
  end
139
+ (1..@max_level).to_a.reverse.each do |l|
140
+ close_section(l)
141
+ end
142
+ @starts[0]
143
+ end
87
144
 
88
- if @conf['select']['select']
89
- conf=@conf['select']
90
- parse_rec(ret,conf)
145
+ def open_section(obj,lvl,el)
146
+ # if @current_level < lvl
147
+ t=create_textNode((el.root.search(@content...(el.xpath))[1..-1].to_html))
148
+ @starts[@current_level].content.push(t)
149
+ # end
150
+ (lvl..@max_level).to_a.reverse.each do |l|
151
+ close_section(l)
91
152
  end
153
+ @starts[lvl]=create_fbsection(el.root.at(obj[:xpath]).extract_text,obj[:fblevel])
154
+ @content=obj[:xpath]
155
+ @current_level=lvl
92
156
  end
93
157
 
94
- protected
158
+ def close_section(lvl)
159
+ return if @starts[lvl].nil?
160
+ @starts[lvl-1].content.push @starts[lvl]
161
+ @starts[lvl]=nil
162
+ end
163
+
164
+ def feed(el)
165
+ return if el.is_a?Hpricot::Text
166
+ @done=[[]*@levels.size]
95
167
 
96
- def parse_rec(el,conf)
97
- return if conf.nil?
98
- if el.is_a?Section
99
- el.content.each do |l|
100
- if l.is_a?Section
101
- parse_rec(l,conf['select'])
102
- else
103
- doc=Hpricot(l.content)
104
- ti = doc.search('//'+conf['expr'])
105
- return if ti.size ==0
106
- tit = ti.zip ti[1..-1]+[nil]
107
-
108
- tit.each do |a|
109
- s=Section.new
110
- s.fblevel=conf['fblevel']
111
- tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).collect{|r| r.to_original_html}
112
-
113
- s.content = [Text.new(tmp.join)]
114
- s.title = extract_text(a.first)
115
- el.content.push s
116
- tmp.each{|t|l.content.sub!(t,'')}
117
- l.content.sub!(a.first.to_original_html,'')
118
- end
168
+ @levels.each_with_index do |lvl,i|
169
+ lvl.each do |expr|
170
+ #puts i.to_s+" "+el.inspect if el.in_search?(expr['expr'])
171
+ if el.in_search?(expr['expr'])
119
172
 
173
+
174
+ open_section({:xpath => el.xpath, :fblevel => expr['fblevel']},i+1,el)
120
175
  end
121
176
  end
122
177
  end
178
+
123
179
  end
124
180
  end
125
181
  end
@@ -138,36 +194,30 @@ class NilClass
138
194
  end
139
195
 
140
196
  module Hpricot::Traverse
141
- def between(i,j)
142
- #puts i,j
143
- unless j.nil? || self.at(j).nil?
144
- prec=self.at(i).deep_preceding
145
- Hpricot::Elements[*self.at(j).deep_preceding.find_all{|el| !prec.include?el}]
146
- else
147
- self.at(i).deep_following unless self.at(i).nil?
197
+ def in_search?(expr)
198
+ se_in=self.parent
199
+ if expr[0..1]=='/'
200
+ se_in=se_in.parent until se_in.parent.nil?
148
201
  end
202
+ se_in.search(expr).each do |el|
203
+ return true if el==self
204
+ end
205
+ # puts self.name+" "+expr
206
+ return false
149
207
  end
150
208
 
151
- def deep_preceding()
152
- ret=Hpricot::Elements[]
153
- ret+=parent.deep_preceding if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
154
- ret+=preceding
155
- Hpricot::Elements[*ret]
156
- end
157
- def deep_following()
158
- ret=following
159
- ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
160
- Hpricot::Elements[*ret]
161
- end
162
- end
163
-
164
-
165
- class Hpricot::Elements
166
- def between(i,j)
167
- Hpricot::Elements[*self.collect{|a| a.between(i,j)}]
209
+ def root
210
+ se_in=self
211
+ se_in=se_in.parent until se_in.parent.nil?
212
+ se_in
168
213
  end
169
214
 
170
- def -(a)
171
- Hpricot::Elements[*self.find_all{|el| !a.include?el}]
215
+ def extract_text
216
+ t=''
217
+ self.traverse_all_element do |e|
218
+ t+=e.content.to_s if e.is_a?(Hpricot::Text)
219
+ end
220
+ t
172
221
  end
173
222
  end
223
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: zetaben-Html2Feedbooks
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.4
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Benoit Larroque
@@ -42,6 +42,16 @@ dependencies:
42
42
  - !ruby/object:Gem::Version
43
43
  version: "0.3"
44
44
  version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: progressbar
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 0.0.3
54
+ version:
45
55
  description: Html2Feedbooks is script to automate basic publishing on feedbooks.com
46
56
  email: zeta dot ben at gmail dot com
47
57
  executables:
@@ -62,6 +72,7 @@ files:
62
72
  - lib/parser.rb
63
73
  has_rdoc: true
64
74
  homepage: http://github.com/Html2Feedbooks
75
+ licenses:
65
76
  post_install_message:
66
77
  rdoc_options: []
67
78
 
@@ -82,7 +93,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
82
93
  requirements: []
83
94
 
84
95
  rubyforge_project:
85
- rubygems_version: 1.2.0
96
+ rubygems_version: 1.3.5
86
97
  signing_key:
87
98
  specification_version: 2
88
99
  summary: Html2Feedbooks is script to automate basic publishing on feedbooks.com