zetaben-Html2Feedbooks 0.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +3 -1
- data/bin/html2fb.rb +23 -1
- data/lib/app.rb +14 -2
- data/lib/conf.rb +1 -1
- data/lib/feedbooks.rb +1 -0
- data/lib/parser.rb +36 -8
- metadata +11 -1
data/README
CHANGED
data/bin/html2fb.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#!/usr/bin/ruby
|
2
|
+
require 'optparse'
|
2
3
|
require 'open-uri'
|
3
4
|
require 'conf.rb'
|
4
5
|
require 'downloader.rb'
|
@@ -8,6 +9,20 @@ require 'feedbooks.rb'
|
|
8
9
|
|
9
10
|
include HTML2FB
|
10
11
|
|
12
|
+
options = {}
|
13
|
+
options[:conf] = "conf.yaml"
|
14
|
+
options[:preview] = true
|
15
|
+
OptionParser.new do |opts|
|
16
|
+
opts.banner = "Usage: html2fb [options] URL"
|
17
|
+
|
18
|
+
opts.on("-c", "--conf FILE", String,"Configuration file") do |f|
|
19
|
+
options[:conf] = f
|
20
|
+
end
|
21
|
+
opts.on("-s", "-s","Send to feedbooks") do |f|
|
22
|
+
options[:preview] = !f
|
23
|
+
end
|
24
|
+
end.parse!
|
25
|
+
|
11
26
|
valid=false
|
12
27
|
entry=ARGV[0]
|
13
28
|
while !valid
|
@@ -23,9 +38,16 @@ while !valid
|
|
23
38
|
print "URL : " if entry.nil? || entry==''
|
24
39
|
entry=STDIN.readline.strip unless valid
|
25
40
|
end
|
26
|
-
conf=Conf.new(
|
41
|
+
conf=Conf.new(options[:conf])
|
27
42
|
content=Downloader.download(url)
|
28
43
|
#puts content.size
|
29
44
|
doc=Parser.new(conf).parse(content)
|
30
45
|
puts doc.toc.to_yaml
|
46
|
+
if options[:preview]
|
47
|
+
f=File.open('/tmp/plop.html','w')
|
48
|
+
f.write doc.to_html
|
49
|
+
f.close
|
50
|
+
`firefox /tmp/plop.html`
|
51
|
+
else
|
31
52
|
doc.to_feedbooks(conf)
|
53
|
+
end
|
data/lib/app.rb
CHANGED
@@ -2,6 +2,7 @@ require 'digest/md5'
|
|
2
2
|
require 'open-uri'
|
3
3
|
require 'net/http'
|
4
4
|
require 'time'
|
5
|
+
require 'htmlentities'
|
5
6
|
|
6
7
|
class AtomPost
|
7
8
|
attr_accessor :title
|
@@ -25,11 +26,11 @@ class AtomPost
|
|
25
26
|
|
26
27
|
req.body = '<?xml version="1.0"?>'+"\n"
|
27
28
|
req.body +='<entry xmlns="http://www.w3.org/2005/Atom">'+"\n"
|
28
|
-
req.body +='<title>'+title+'</title>'+"\n"
|
29
|
+
req.body +='<title>'+recode_text(title)+'</title>'+"\n"
|
29
30
|
req.body +='<id>'+Digest::MD5.hexdigest(title+content)+'</id>'+"\n"
|
30
31
|
req.body +='<updated>'+date.xmlschema+'</updated>'+"\n"
|
31
32
|
req.body +='<author><name>'+author+'</name></author>'+"\n"
|
32
|
-
req.body +='<content>'+content+'</content>'+"\n"
|
33
|
+
req.body +='<content>'+recode_text(content)+'</content>'+"\n"
|
33
34
|
req.body +='</entry>'+"\n"
|
34
35
|
|
35
36
|
req.set_content_type('application/atom+xml;type=entry')
|
@@ -46,4 +47,15 @@ class AtomPost
|
|
46
47
|
res.error!
|
47
48
|
end
|
48
49
|
end
|
50
|
+
|
51
|
+
def recode_text(txt)
|
52
|
+
return txt if txt.blank?
|
53
|
+
m=Hpricot(txt)
|
54
|
+
m.traverse_text{|t| t.content=force_decimal_entities(t.content) if t.content.match(/&[a-z][a-z0-9]+;/i)}
|
55
|
+
m.to_html
|
56
|
+
end
|
57
|
+
HTMLENCODER=HTMLEntities.new
|
58
|
+
def force_decimal_entities(txt)
|
59
|
+
HTMLENCODER.encode(HTMLENCODER.decode(txt),:decimal)
|
60
|
+
end
|
49
61
|
end
|
data/lib/conf.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
module HTML2FB
|
3
3
|
class Conf
|
4
4
|
def initialize(file)
|
5
|
-
['./',"#{File.dirname(__FILE__)}/","#{File.dirname(__FILE__)}/../confs/"].each do |p|
|
5
|
+
['','./',"#{File.dirname(__FILE__)}/","#{File.dirname(__FILE__)}/../confs/"].each do |p|
|
6
6
|
f=p+file
|
7
7
|
begin
|
8
8
|
if File.readable?(f) && File.exists?(f)
|
data/lib/feedbooks.rb
CHANGED
@@ -54,6 +54,7 @@ module HTML2FB
|
|
54
54
|
class Section
|
55
55
|
@@level=0
|
56
56
|
def to_feedbooks(conf)
|
57
|
+
puts "Sending to feedbooks"
|
57
58
|
fb=FBSession.session
|
58
59
|
post=AtomPost.new "http://#{fb.host}/#{fb.booktype}/#{fb.bookid}/contents.atom"
|
59
60
|
doc=Hpricot('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">'+to_html+'</div>')
|
data/lib/parser.rb
CHANGED
@@ -8,16 +8,27 @@ module HTML2FB
|
|
8
8
|
@conf=conf
|
9
9
|
end
|
10
10
|
|
11
|
+
def extract_text(n)
|
12
|
+
t=''
|
13
|
+
n.traverse_all_element do |e|
|
14
|
+
t+=e.content.to_s if e.is_a?(Hpricot::Text)
|
15
|
+
end
|
16
|
+
t
|
17
|
+
end
|
18
|
+
|
11
19
|
def parse(txt)
|
20
|
+
puts "Parsing HTML"
|
12
21
|
pdoc=Hpricot(txt)
|
13
22
|
doc=Document.new
|
23
|
+
puts "Removing garbage elements"
|
14
24
|
remove_objs(pdoc)
|
15
25
|
ti=pdoc.at('title')
|
16
|
-
doc.title= ti.
|
26
|
+
doc.title= extract_text(ti).strip unless ti.nil?
|
17
27
|
# pdoc.search('//h3').each do |e|
|
18
28
|
# doc.content.push(e.inner_text)
|
19
29
|
# end
|
20
30
|
|
31
|
+
puts "Building TOC"
|
21
32
|
parse_text(pdoc,doc)
|
22
33
|
|
23
34
|
return doc
|
@@ -34,8 +45,10 @@ module HTML2FB
|
|
34
45
|
end unless @conf['remove']['expr'].nil?
|
35
46
|
@conf['remove']['before'].each do |cl|
|
36
47
|
x=doc.at(cl)
|
48
|
+
if x
|
37
49
|
x.preceding.remove
|
38
50
|
x.parent.children.delete(x)
|
51
|
+
end
|
39
52
|
end unless @conf['remove']['before'].nil?
|
40
53
|
@conf['remove']['between'].each do |cl|
|
41
54
|
# puts "between "+cl.inspect
|
@@ -43,11 +56,13 @@ module HTML2FB
|
|
43
56
|
end unless @conf['remove']['between'].nil?
|
44
57
|
@conf['remove']['after'].each do |cl|
|
45
58
|
x=doc.at(cl)
|
59
|
+
if x
|
46
60
|
x.following.remove
|
47
61
|
x.parent.children.delete(x)
|
62
|
+
end
|
48
63
|
end unless @conf['remove']['after'].nil?
|
49
64
|
end
|
50
|
-
File.open('/tmp/test.html','w'){|f| f.write doc.to_html}
|
65
|
+
# File.open('/tmp/test.html','w'){|f| f.write doc.to_html}
|
51
66
|
end
|
52
67
|
|
53
68
|
def parse_text(doc,ret)
|
@@ -59,7 +74,8 @@ module HTML2FB
|
|
59
74
|
tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).to_html
|
60
75
|
tmp.sub!(a.first.to_original_html,'')
|
61
76
|
s.content =[Text.new(tmp)]
|
62
|
-
|
77
|
+
#buggy with entities
|
78
|
+
s.title = extract_text(a.first)
|
63
79
|
ret.content.push s
|
64
80
|
|
65
81
|
end
|
@@ -88,7 +104,7 @@ module HTML2FB
|
|
88
104
|
s=Section.new
|
89
105
|
tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).to_html
|
90
106
|
s.content = [Text.new(tmp)]
|
91
|
-
s.title = a.first
|
107
|
+
s.title = extract_text(a.first)
|
92
108
|
el.content.push s
|
93
109
|
l.content.sub!(tmp,'')
|
94
110
|
l.content.sub!(a.first.to_original_html,'')
|
@@ -117,13 +133,25 @@ end
|
|
117
133
|
module Hpricot::Traverse
|
118
134
|
def between(i,j)
|
119
135
|
#puts i,j
|
120
|
-
unless j.nil?
|
121
|
-
prec=self.at(i).
|
122
|
-
Hpricot::Elements[*self.at(j).
|
136
|
+
unless j.nil? || self.at(j).nil?
|
137
|
+
prec=self.at(i).deep_preceding
|
138
|
+
Hpricot::Elements[*self.at(j).deep_preceding.find_all{|el| !prec.include?el}]
|
123
139
|
else
|
124
|
-
self.at(i).
|
140
|
+
self.at(i).deep_following unless self.at(i).nil?
|
125
141
|
end
|
126
142
|
end
|
143
|
+
|
144
|
+
def deep_preceding()
|
145
|
+
ret=Hpricot::Elements[]
|
146
|
+
ret+=parent.deep_preceding if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
|
147
|
+
ret+=preceding
|
148
|
+
Hpricot::Elements[*ret]
|
149
|
+
end
|
150
|
+
def deep_following()
|
151
|
+
ret=following
|
152
|
+
ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
|
153
|
+
Hpricot::Elements[*ret]
|
154
|
+
end
|
127
155
|
end
|
128
156
|
|
129
157
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zetaben-Html2Feedbooks
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: "0.
|
4
|
+
version: "0.2"
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benoit Larroque
|
@@ -22,6 +22,16 @@ dependencies:
|
|
22
22
|
- !ruby/object:Gem::Version
|
23
23
|
version: "0.6"
|
24
24
|
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: htmlentities
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "4.0"
|
34
|
+
version:
|
25
35
|
description: Html2Feedbooks is script to automate basic publishing on feedbooks.com
|
26
36
|
email: zeta dot ben at gmail dot com
|
27
37
|
executables:
|