zetaben-Html2Feedbooks 0.4.4 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/confs/conf.yaml +5 -5
- data/lib/document.rb +12 -6
- data/lib/feedbooks.rb +4 -5
- data/lib/parser.rb +128 -78
- metadata +13 -2
data/confs/conf.yaml
CHANGED
data/lib/document.rb
CHANGED
@@ -24,11 +24,11 @@ module HTML2FB
|
|
24
24
|
def titles
|
25
25
|
tit=[]
|
26
26
|
content.each do |f|
|
27
|
-
if f.is_a?Section
|
28
|
-
tit.push f.
|
29
|
-
else
|
30
|
-
tit.push '#text'
|
31
|
-
end
|
27
|
+
# if f.is_a?Section
|
28
|
+
tit.push f.titles
|
29
|
+
# else
|
30
|
+
# tit.push '#text'
|
31
|
+
# end
|
32
32
|
end
|
33
33
|
|
34
34
|
return [decorated_title,tit]
|
@@ -42,7 +42,9 @@ module HTML2FB
|
|
42
42
|
class Document < Section
|
43
43
|
def toc
|
44
44
|
#return content
|
45
|
-
return content.collect{|a|
|
45
|
+
return content.collect{|a|
|
46
|
+
a.titles
|
47
|
+
}
|
46
48
|
end
|
47
49
|
|
48
50
|
end
|
@@ -61,5 +63,9 @@ module HTML2FB
|
|
61
63
|
def to_s
|
62
64
|
@content
|
63
65
|
end
|
66
|
+
|
67
|
+
def titles
|
68
|
+
return ['#text']
|
69
|
+
end
|
64
70
|
end
|
65
71
|
end
|
data/lib/feedbooks.rb
CHANGED
@@ -93,11 +93,10 @@ module HTML2FB
|
|
93
93
|
|
94
94
|
def to_html
|
95
95
|
ret=nil
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
end
|
96
|
+
ret="<h#{@@level+1}>"+title+"</h#{@@level+1}>"
|
97
|
+
@@level+=1
|
98
|
+
ret+=old_to_html
|
99
|
+
@@level-=1
|
101
100
|
ret
|
102
101
|
end
|
103
102
|
end
|
data/lib/parser.rb
CHANGED
@@ -1,5 +1,8 @@
|
|
1
1
|
require 'hpricot'
|
2
2
|
require 'document.rb'
|
3
|
+
require 'progressbar'
|
4
|
+
#require 'term/ansicolor'
|
5
|
+
#include Term::ANSIColor
|
3
6
|
|
4
7
|
module HTML2FB
|
5
8
|
class Parser
|
@@ -8,14 +11,6 @@ module HTML2FB
|
|
8
11
|
@conf=conf
|
9
12
|
end
|
10
13
|
|
11
|
-
def extract_text(n)
|
12
|
-
t=''
|
13
|
-
n.traverse_all_element do |e|
|
14
|
-
t+=e.content.to_s if e.is_a?(Hpricot::Text)
|
15
|
-
end
|
16
|
-
t
|
17
|
-
end
|
18
|
-
|
19
14
|
def parse(txt)
|
20
15
|
puts "Parsing HTML"
|
21
16
|
pdoc=Hpricot(txt)
|
@@ -23,7 +18,7 @@ module HTML2FB
|
|
23
18
|
puts "Removing garbage elements"
|
24
19
|
remove_objs(pdoc)
|
25
20
|
ti=pdoc.at('title')
|
26
|
-
doc.title=
|
21
|
+
doc.title= ti.extract_text.strip unless ti.nil?
|
27
22
|
# pdoc.search('//h3').each do |e|
|
28
23
|
# doc.content.push(e.inner_text)
|
29
24
|
# end
|
@@ -31,6 +26,8 @@ module HTML2FB
|
|
31
26
|
puts "Building TOC"
|
32
27
|
parse_text(pdoc,doc)
|
33
28
|
|
29
|
+
# puts green(bold(doc.pretty_inspect))
|
30
|
+
|
34
31
|
return doc
|
35
32
|
end
|
36
33
|
protected
|
@@ -66,60 +63,119 @@ module HTML2FB
|
|
66
63
|
end
|
67
64
|
|
68
65
|
def parse_text(doc,ret)
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
66
|
+
aut=build_autom(@conf['select'],ret)
|
67
|
+
|
68
|
+
pbar = ProgressBar.new("Parsing", doc.search('//').size)
|
69
|
+
doc.traverse_all_element do |el|
|
70
|
+
aut.feed(el)
|
71
|
+
pbar.inc
|
73
72
|
end
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
73
|
+
pbar.finish
|
74
|
+
aut.finish(doc)
|
75
|
+
end
|
76
|
+
|
77
|
+
protected
|
78
|
+
|
79
|
+
def build_autom(conf_tab,doc)
|
80
|
+
mach=StateMachine.new
|
81
|
+
build_rec(mach,conf_tab)
|
82
|
+
mach.reset(doc)
|
83
|
+
mach
|
84
|
+
end
|
85
|
+
|
86
|
+
def build_rec(mach,conf_tab)
|
87
|
+
return if conf_tab.size < 1
|
88
|
+
exprs=conf_tab.collect{|e| e.reject{|k,v| k=='select'} }
|
89
|
+
mach.add_level(exprs)
|
90
|
+
build_rec(mach,conf_tab.collect{|e| e['select'] }.flatten.reject{|a|a.nil?})
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
class StateMachine
|
95
|
+
|
96
|
+
def initialize
|
97
|
+
@levels=[]
|
98
|
+
@current_level=0
|
99
|
+
@starts=[]
|
100
|
+
@done=[]
|
101
|
+
@max_level=0
|
102
|
+
@content=nil
|
103
|
+
end
|
104
|
+
|
105
|
+
def add_level(tab)
|
106
|
+
tab=[tab] unless tab.is_a?Array
|
107
|
+
@levels.push tab
|
108
|
+
@current_level+=1
|
109
|
+
end
|
110
|
+
|
111
|
+
def reset(doc)
|
112
|
+
@current_level=0
|
113
|
+
@max_level=@levels.size
|
114
|
+
@starts[0]=doc
|
115
|
+
@content='body'
|
116
|
+
end
|
117
|
+
|
118
|
+
def inspect
|
119
|
+
@levels.inspect+"\n"+@current_level.to_s+"\n\n"+@done.inspect
|
120
|
+
end
|
121
|
+
|
122
|
+
def create_fbsection(title,fblevel)
|
123
|
+
s=Section.new
|
124
|
+
s.fblevel=fblevel
|
125
|
+
s.title = title
|
126
|
+
s
|
127
|
+
end
|
128
|
+
|
129
|
+
def create_textNode(txt)
|
130
|
+
Text.new(txt)
|
131
|
+
end
|
132
|
+
|
133
|
+
def finish(doc)
|
134
|
+
unless @content.nil?
|
135
|
+
# t=create_textNode(doc.root.search(@content...doc.children.last.xpath))
|
136
|
+
t=create_textNode(doc.at(@content).following.to_html)
|
137
|
+
@starts[@current_level].content.push(t)
|
86
138
|
end
|
139
|
+
(1..@max_level).to_a.reverse.each do |l|
|
140
|
+
close_section(l)
|
141
|
+
end
|
142
|
+
@starts[0]
|
143
|
+
end
|
87
144
|
|
88
|
-
|
89
|
-
|
90
|
-
|
145
|
+
def open_section(obj,lvl,el)
|
146
|
+
# if @current_level < lvl
|
147
|
+
t=create_textNode((el.root.search(@content...(el.xpath))[1..-1].to_html))
|
148
|
+
@starts[@current_level].content.push(t)
|
149
|
+
# end
|
150
|
+
(lvl..@max_level).to_a.reverse.each do |l|
|
151
|
+
close_section(l)
|
91
152
|
end
|
153
|
+
@starts[lvl]=create_fbsection(el.root.at(obj[:xpath]).extract_text,obj[:fblevel])
|
154
|
+
@content=obj[:xpath]
|
155
|
+
@current_level=lvl
|
92
156
|
end
|
93
157
|
|
94
|
-
|
158
|
+
def close_section(lvl)
|
159
|
+
return if @starts[lvl].nil?
|
160
|
+
@starts[lvl-1].content.push @starts[lvl]
|
161
|
+
@starts[lvl]=nil
|
162
|
+
end
|
163
|
+
|
164
|
+
def feed(el)
|
165
|
+
return if el.is_a?Hpricot::Text
|
166
|
+
@done=[[]*@levels.size]
|
95
167
|
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
if l.is_a?Section
|
101
|
-
parse_rec(l,conf['select'])
|
102
|
-
else
|
103
|
-
doc=Hpricot(l.content)
|
104
|
-
ti = doc.search('//'+conf['expr'])
|
105
|
-
return if ti.size ==0
|
106
|
-
tit = ti.zip ti[1..-1]+[nil]
|
107
|
-
|
108
|
-
tit.each do |a|
|
109
|
-
s=Section.new
|
110
|
-
s.fblevel=conf['fblevel']
|
111
|
-
tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).collect{|r| r.to_original_html}
|
112
|
-
|
113
|
-
s.content = [Text.new(tmp.join)]
|
114
|
-
s.title = extract_text(a.first)
|
115
|
-
el.content.push s
|
116
|
-
tmp.each{|t|l.content.sub!(t,'')}
|
117
|
-
l.content.sub!(a.first.to_original_html,'')
|
118
|
-
end
|
168
|
+
@levels.each_with_index do |lvl,i|
|
169
|
+
lvl.each do |expr|
|
170
|
+
#puts i.to_s+" "+el.inspect if el.in_search?(expr['expr'])
|
171
|
+
if el.in_search?(expr['expr'])
|
119
172
|
|
173
|
+
|
174
|
+
open_section({:xpath => el.xpath, :fblevel => expr['fblevel']},i+1,el)
|
120
175
|
end
|
121
176
|
end
|
122
177
|
end
|
178
|
+
|
123
179
|
end
|
124
180
|
end
|
125
181
|
end
|
@@ -138,36 +194,30 @@ class NilClass
|
|
138
194
|
end
|
139
195
|
|
140
196
|
module Hpricot::Traverse
|
141
|
-
def
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
Hpricot::Elements[*self.at(j).deep_preceding.find_all{|el| !prec.include?el}]
|
146
|
-
else
|
147
|
-
self.at(i).deep_following unless self.at(i).nil?
|
197
|
+
def in_search?(expr)
|
198
|
+
se_in=self.parent
|
199
|
+
if expr[0..1]=='/'
|
200
|
+
se_in=se_in.parent until se_in.parent.nil?
|
148
201
|
end
|
202
|
+
se_in.search(expr).each do |el|
|
203
|
+
return true if el==self
|
204
|
+
end
|
205
|
+
# puts self.name+" "+expr
|
206
|
+
return false
|
149
207
|
end
|
150
208
|
|
151
|
-
def
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
Hpricot::Elements[*ret]
|
156
|
-
end
|
157
|
-
def deep_following()
|
158
|
-
ret=following
|
159
|
-
ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
|
160
|
-
Hpricot::Elements[*ret]
|
161
|
-
end
|
162
|
-
end
|
163
|
-
|
164
|
-
|
165
|
-
class Hpricot::Elements
|
166
|
-
def between(i,j)
|
167
|
-
Hpricot::Elements[*self.collect{|a| a.between(i,j)}]
|
209
|
+
def root
|
210
|
+
se_in=self
|
211
|
+
se_in=se_in.parent until se_in.parent.nil?
|
212
|
+
se_in
|
168
213
|
end
|
169
214
|
|
170
|
-
def
|
171
|
-
|
215
|
+
def extract_text
|
216
|
+
t=''
|
217
|
+
self.traverse_all_element do |e|
|
218
|
+
t+=e.content.to_s if e.is_a?(Hpricot::Text)
|
219
|
+
end
|
220
|
+
t
|
172
221
|
end
|
173
222
|
end
|
223
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zetaben-Html2Feedbooks
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.4
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benoit Larroque
|
@@ -42,6 +42,16 @@ dependencies:
|
|
42
42
|
- !ruby/object:Gem::Version
|
43
43
|
version: "0.3"
|
44
44
|
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: progressbar
|
47
|
+
type: :runtime
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 0.0.3
|
54
|
+
version:
|
45
55
|
description: Html2Feedbooks is script to automate basic publishing on feedbooks.com
|
46
56
|
email: zeta dot ben at gmail dot com
|
47
57
|
executables:
|
@@ -62,6 +72,7 @@ files:
|
|
62
72
|
- lib/parser.rb
|
63
73
|
has_rdoc: true
|
64
74
|
homepage: http://github.com/Html2Feedbooks
|
75
|
+
licenses:
|
65
76
|
post_install_message:
|
66
77
|
rdoc_options: []
|
67
78
|
|
@@ -82,7 +93,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
82
93
|
requirements: []
|
83
94
|
|
84
95
|
rubyforge_project:
|
85
|
-
rubygems_version: 1.
|
96
|
+
rubygems_version: 1.3.5
|
86
97
|
signing_key:
|
87
98
|
specification_version: 2
|
88
99
|
summary: Html2Feedbooks is script to automate basic publishing on feedbooks.com
|