zetaben-Html2Feedbooks 0.4.4 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/confs/conf.yaml +5 -5
- data/lib/document.rb +12 -6
- data/lib/feedbooks.rb +4 -5
- data/lib/parser.rb +128 -78
- metadata +13 -2
data/confs/conf.yaml
CHANGED
data/lib/document.rb
CHANGED
@@ -24,11 +24,11 @@ module HTML2FB
|
|
24
24
|
def titles
|
25
25
|
tit=[]
|
26
26
|
content.each do |f|
|
27
|
-
if f.is_a?Section
|
28
|
-
tit.push f.
|
29
|
-
else
|
30
|
-
tit.push '#text'
|
31
|
-
end
|
27
|
+
# if f.is_a?Section
|
28
|
+
tit.push f.titles
|
29
|
+
# else
|
30
|
+
# tit.push '#text'
|
31
|
+
# end
|
32
32
|
end
|
33
33
|
|
34
34
|
return [decorated_title,tit]
|
@@ -42,7 +42,9 @@ module HTML2FB
|
|
42
42
|
class Document < Section
|
43
43
|
def toc
|
44
44
|
#return content
|
45
|
-
return content.collect{|a|
|
45
|
+
return content.collect{|a|
|
46
|
+
a.titles
|
47
|
+
}
|
46
48
|
end
|
47
49
|
|
48
50
|
end
|
@@ -61,5 +63,9 @@ module HTML2FB
|
|
61
63
|
def to_s
|
62
64
|
@content
|
63
65
|
end
|
66
|
+
|
67
|
+
def titles
|
68
|
+
return ['#text']
|
69
|
+
end
|
64
70
|
end
|
65
71
|
end
|
data/lib/feedbooks.rb
CHANGED
@@ -93,11 +93,10 @@ module HTML2FB
|
|
93
93
|
|
94
94
|
def to_html
|
95
95
|
ret=nil
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
end
|
96
|
+
ret="<h#{@@level+1}>"+title+"</h#{@@level+1}>"
|
97
|
+
@@level+=1
|
98
|
+
ret+=old_to_html
|
99
|
+
@@level-=1
|
101
100
|
ret
|
102
101
|
end
|
103
102
|
end
|
data/lib/parser.rb
CHANGED
@@ -1,5 +1,8 @@
|
|
1
1
|
require 'hpricot'
|
2
2
|
require 'document.rb'
|
3
|
+
require 'progressbar'
|
4
|
+
#require 'term/ansicolor'
|
5
|
+
#include Term::ANSIColor
|
3
6
|
|
4
7
|
module HTML2FB
|
5
8
|
class Parser
|
@@ -8,14 +11,6 @@ module HTML2FB
|
|
8
11
|
@conf=conf
|
9
12
|
end
|
10
13
|
|
11
|
-
def extract_text(n)
|
12
|
-
t=''
|
13
|
-
n.traverse_all_element do |e|
|
14
|
-
t+=e.content.to_s if e.is_a?(Hpricot::Text)
|
15
|
-
end
|
16
|
-
t
|
17
|
-
end
|
18
|
-
|
19
14
|
def parse(txt)
|
20
15
|
puts "Parsing HTML"
|
21
16
|
pdoc=Hpricot(txt)
|
@@ -23,7 +18,7 @@ module HTML2FB
|
|
23
18
|
puts "Removing garbage elements"
|
24
19
|
remove_objs(pdoc)
|
25
20
|
ti=pdoc.at('title')
|
26
|
-
doc.title=
|
21
|
+
doc.title= ti.extract_text.strip unless ti.nil?
|
27
22
|
# pdoc.search('//h3').each do |e|
|
28
23
|
# doc.content.push(e.inner_text)
|
29
24
|
# end
|
@@ -31,6 +26,8 @@ module HTML2FB
|
|
31
26
|
puts "Building TOC"
|
32
27
|
parse_text(pdoc,doc)
|
33
28
|
|
29
|
+
# puts green(bold(doc.pretty_inspect))
|
30
|
+
|
34
31
|
return doc
|
35
32
|
end
|
36
33
|
protected
|
@@ -66,60 +63,119 @@ module HTML2FB
|
|
66
63
|
end
|
67
64
|
|
68
65
|
def parse_text(doc,ret)
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
66
|
+
aut=build_autom(@conf['select'],ret)
|
67
|
+
|
68
|
+
pbar = ProgressBar.new("Parsing", doc.search('//').size)
|
69
|
+
doc.traverse_all_element do |el|
|
70
|
+
aut.feed(el)
|
71
|
+
pbar.inc
|
73
72
|
end
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
73
|
+
pbar.finish
|
74
|
+
aut.finish(doc)
|
75
|
+
end
|
76
|
+
|
77
|
+
protected
|
78
|
+
|
79
|
+
def build_autom(conf_tab,doc)
|
80
|
+
mach=StateMachine.new
|
81
|
+
build_rec(mach,conf_tab)
|
82
|
+
mach.reset(doc)
|
83
|
+
mach
|
84
|
+
end
|
85
|
+
|
86
|
+
def build_rec(mach,conf_tab)
|
87
|
+
return if conf_tab.size < 1
|
88
|
+
exprs=conf_tab.collect{|e| e.reject{|k,v| k=='select'} }
|
89
|
+
mach.add_level(exprs)
|
90
|
+
build_rec(mach,conf_tab.collect{|e| e['select'] }.flatten.reject{|a|a.nil?})
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
class StateMachine
|
95
|
+
|
96
|
+
def initialize
|
97
|
+
@levels=[]
|
98
|
+
@current_level=0
|
99
|
+
@starts=[]
|
100
|
+
@done=[]
|
101
|
+
@max_level=0
|
102
|
+
@content=nil
|
103
|
+
end
|
104
|
+
|
105
|
+
def add_level(tab)
|
106
|
+
tab=[tab] unless tab.is_a?Array
|
107
|
+
@levels.push tab
|
108
|
+
@current_level+=1
|
109
|
+
end
|
110
|
+
|
111
|
+
def reset(doc)
|
112
|
+
@current_level=0
|
113
|
+
@max_level=@levels.size
|
114
|
+
@starts[0]=doc
|
115
|
+
@content='body'
|
116
|
+
end
|
117
|
+
|
118
|
+
def inspect
|
119
|
+
@levels.inspect+"\n"+@current_level.to_s+"\n\n"+@done.inspect
|
120
|
+
end
|
121
|
+
|
122
|
+
def create_fbsection(title,fblevel)
|
123
|
+
s=Section.new
|
124
|
+
s.fblevel=fblevel
|
125
|
+
s.title = title
|
126
|
+
s
|
127
|
+
end
|
128
|
+
|
129
|
+
def create_textNode(txt)
|
130
|
+
Text.new(txt)
|
131
|
+
end
|
132
|
+
|
133
|
+
def finish(doc)
|
134
|
+
unless @content.nil?
|
135
|
+
# t=create_textNode(doc.root.search(@content...doc.children.last.xpath))
|
136
|
+
t=create_textNode(doc.at(@content).following.to_html)
|
137
|
+
@starts[@current_level].content.push(t)
|
86
138
|
end
|
139
|
+
(1..@max_level).to_a.reverse.each do |l|
|
140
|
+
close_section(l)
|
141
|
+
end
|
142
|
+
@starts[0]
|
143
|
+
end
|
87
144
|
|
88
|
-
|
89
|
-
|
90
|
-
|
145
|
+
def open_section(obj,lvl,el)
|
146
|
+
# if @current_level < lvl
|
147
|
+
t=create_textNode((el.root.search(@content...(el.xpath))[1..-1].to_html))
|
148
|
+
@starts[@current_level].content.push(t)
|
149
|
+
# end
|
150
|
+
(lvl..@max_level).to_a.reverse.each do |l|
|
151
|
+
close_section(l)
|
91
152
|
end
|
153
|
+
@starts[lvl]=create_fbsection(el.root.at(obj[:xpath]).extract_text,obj[:fblevel])
|
154
|
+
@content=obj[:xpath]
|
155
|
+
@current_level=lvl
|
92
156
|
end
|
93
157
|
|
94
|
-
|
158
|
+
def close_section(lvl)
|
159
|
+
return if @starts[lvl].nil?
|
160
|
+
@starts[lvl-1].content.push @starts[lvl]
|
161
|
+
@starts[lvl]=nil
|
162
|
+
end
|
163
|
+
|
164
|
+
def feed(el)
|
165
|
+
return if el.is_a?Hpricot::Text
|
166
|
+
@done=[[]*@levels.size]
|
95
167
|
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
if l.is_a?Section
|
101
|
-
parse_rec(l,conf['select'])
|
102
|
-
else
|
103
|
-
doc=Hpricot(l.content)
|
104
|
-
ti = doc.search('//'+conf['expr'])
|
105
|
-
return if ti.size ==0
|
106
|
-
tit = ti.zip ti[1..-1]+[nil]
|
107
|
-
|
108
|
-
tit.each do |a|
|
109
|
-
s=Section.new
|
110
|
-
s.fblevel=conf['fblevel']
|
111
|
-
tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).collect{|r| r.to_original_html}
|
112
|
-
|
113
|
-
s.content = [Text.new(tmp.join)]
|
114
|
-
s.title = extract_text(a.first)
|
115
|
-
el.content.push s
|
116
|
-
tmp.each{|t|l.content.sub!(t,'')}
|
117
|
-
l.content.sub!(a.first.to_original_html,'')
|
118
|
-
end
|
168
|
+
@levels.each_with_index do |lvl,i|
|
169
|
+
lvl.each do |expr|
|
170
|
+
#puts i.to_s+" "+el.inspect if el.in_search?(expr['expr'])
|
171
|
+
if el.in_search?(expr['expr'])
|
119
172
|
|
173
|
+
|
174
|
+
open_section({:xpath => el.xpath, :fblevel => expr['fblevel']},i+1,el)
|
120
175
|
end
|
121
176
|
end
|
122
177
|
end
|
178
|
+
|
123
179
|
end
|
124
180
|
end
|
125
181
|
end
|
@@ -138,36 +194,30 @@ class NilClass
|
|
138
194
|
end
|
139
195
|
|
140
196
|
module Hpricot::Traverse
|
141
|
-
def
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
Hpricot::Elements[*self.at(j).deep_preceding.find_all{|el| !prec.include?el}]
|
146
|
-
else
|
147
|
-
self.at(i).deep_following unless self.at(i).nil?
|
197
|
+
def in_search?(expr)
|
198
|
+
se_in=self.parent
|
199
|
+
if expr[0..1]=='/'
|
200
|
+
se_in=se_in.parent until se_in.parent.nil?
|
148
201
|
end
|
202
|
+
se_in.search(expr).each do |el|
|
203
|
+
return true if el==self
|
204
|
+
end
|
205
|
+
# puts self.name+" "+expr
|
206
|
+
return false
|
149
207
|
end
|
150
208
|
|
151
|
-
def
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
Hpricot::Elements[*ret]
|
156
|
-
end
|
157
|
-
def deep_following()
|
158
|
-
ret=following
|
159
|
-
ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
|
160
|
-
Hpricot::Elements[*ret]
|
161
|
-
end
|
162
|
-
end
|
163
|
-
|
164
|
-
|
165
|
-
class Hpricot::Elements
|
166
|
-
def between(i,j)
|
167
|
-
Hpricot::Elements[*self.collect{|a| a.between(i,j)}]
|
209
|
+
def root
|
210
|
+
se_in=self
|
211
|
+
se_in=se_in.parent until se_in.parent.nil?
|
212
|
+
se_in
|
168
213
|
end
|
169
214
|
|
170
|
-
def
|
171
|
-
|
215
|
+
def extract_text
|
216
|
+
t=''
|
217
|
+
self.traverse_all_element do |e|
|
218
|
+
t+=e.content.to_s if e.is_a?(Hpricot::Text)
|
219
|
+
end
|
220
|
+
t
|
172
221
|
end
|
173
222
|
end
|
223
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zetaben-Html2Feedbooks
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benoit Larroque
|
@@ -42,6 +42,16 @@ dependencies:
|
|
42
42
|
- !ruby/object:Gem::Version
|
43
43
|
version: "0.3"
|
44
44
|
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: progressbar
|
47
|
+
type: :runtime
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 0.0.3
|
54
|
+
version:
|
45
55
|
description: Html2Feedbooks is script to automate basic publishing on feedbooks.com
|
46
56
|
email: zeta dot ben at gmail dot com
|
47
57
|
executables:
|
@@ -62,6 +72,7 @@ files:
|
|
62
72
|
- lib/parser.rb
|
63
73
|
has_rdoc: true
|
64
74
|
homepage: http://github.com/Html2Feedbooks
|
75
|
+
licenses:
|
65
76
|
post_install_message:
|
66
77
|
rdoc_options: []
|
67
78
|
|
@@ -82,7 +93,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
82
93
|
requirements: []
|
83
94
|
|
84
95
|
rubyforge_project:
|
85
|
-
rubygems_version: 1.
|
96
|
+
rubygems_version: 1.3.5
|
86
97
|
signing_key:
|
87
98
|
specification_version: 2
|
88
99
|
summary: Html2Feedbooks is script to automate basic publishing on feedbooks.com
|