zetaben-Html2Feedbooks 0.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README +3 -1
- data/bin/html2fb.rb +23 -1
- data/lib/app.rb +14 -2
- data/lib/conf.rb +1 -1
- data/lib/feedbooks.rb +1 -0
- data/lib/parser.rb +36 -8
- metadata +11 -1
data/README
CHANGED
data/bin/html2fb.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#!/usr/bin/ruby
|
2
|
+
require 'optparse'
|
2
3
|
require 'open-uri'
|
3
4
|
require 'conf.rb'
|
4
5
|
require 'downloader.rb'
|
@@ -8,6 +9,20 @@ require 'feedbooks.rb'
|
|
8
9
|
|
9
10
|
include HTML2FB
|
10
11
|
|
12
|
+
options = {}
|
13
|
+
options[:conf] = "conf.yaml"
|
14
|
+
options[:preview] = true
|
15
|
+
OptionParser.new do |opts|
|
16
|
+
opts.banner = "Usage: html2fb [options] URL"
|
17
|
+
|
18
|
+
opts.on("-c", "--conf FILE", String,"Configuration file") do |f|
|
19
|
+
options[:conf] = f
|
20
|
+
end
|
21
|
+
opts.on("-s", "-s","Send to feedbooks") do |f|
|
22
|
+
options[:preview] = !f
|
23
|
+
end
|
24
|
+
end.parse!
|
25
|
+
|
11
26
|
valid=false
|
12
27
|
entry=ARGV[0]
|
13
28
|
while !valid
|
@@ -23,9 +38,16 @@ while !valid
|
|
23
38
|
print "URL : " if entry.nil? || entry==''
|
24
39
|
entry=STDIN.readline.strip unless valid
|
25
40
|
end
|
26
|
-
conf=Conf.new(
|
41
|
+
conf=Conf.new(options[:conf])
|
27
42
|
content=Downloader.download(url)
|
28
43
|
#puts content.size
|
29
44
|
doc=Parser.new(conf).parse(content)
|
30
45
|
puts doc.toc.to_yaml
|
46
|
+
if options[:preview]
|
47
|
+
f=File.open('/tmp/plop.html','w')
|
48
|
+
f.write doc.to_html
|
49
|
+
f.close
|
50
|
+
`firefox /tmp/plop.html`
|
51
|
+
else
|
31
52
|
doc.to_feedbooks(conf)
|
53
|
+
end
|
data/lib/app.rb
CHANGED
@@ -2,6 +2,7 @@ require 'digest/md5'
|
|
2
2
|
require 'open-uri'
|
3
3
|
require 'net/http'
|
4
4
|
require 'time'
|
5
|
+
require 'htmlentities'
|
5
6
|
|
6
7
|
class AtomPost
|
7
8
|
attr_accessor :title
|
@@ -25,11 +26,11 @@ class AtomPost
|
|
25
26
|
|
26
27
|
req.body = '<?xml version="1.0"?>'+"\n"
|
27
28
|
req.body +='<entry xmlns="http://www.w3.org/2005/Atom">'+"\n"
|
28
|
-
req.body +='<title>'+title+'</title>'+"\n"
|
29
|
+
req.body +='<title>'+recode_text(title)+'</title>'+"\n"
|
29
30
|
req.body +='<id>'+Digest::MD5.hexdigest(title+content)+'</id>'+"\n"
|
30
31
|
req.body +='<updated>'+date.xmlschema+'</updated>'+"\n"
|
31
32
|
req.body +='<author><name>'+author+'</name></author>'+"\n"
|
32
|
-
req.body +='<content>'+content+'</content>'+"\n"
|
33
|
+
req.body +='<content>'+recode_text(content)+'</content>'+"\n"
|
33
34
|
req.body +='</entry>'+"\n"
|
34
35
|
|
35
36
|
req.set_content_type('application/atom+xml;type=entry')
|
@@ -46,4 +47,15 @@ class AtomPost
|
|
46
47
|
res.error!
|
47
48
|
end
|
48
49
|
end
|
50
|
+
|
51
|
+
def recode_text(txt)
|
52
|
+
return txt if txt.blank?
|
53
|
+
m=Hpricot(txt)
|
54
|
+
m.traverse_text{|t| t.content=force_decimal_entities(t.content) if t.content.match(/&[a-z][a-z0-9]+;/i)}
|
55
|
+
m.to_html
|
56
|
+
end
|
57
|
+
HTMLENCODER=HTMLEntities.new
|
58
|
+
def force_decimal_entities(txt)
|
59
|
+
HTMLENCODER.encode(HTMLENCODER.decode(txt),:decimal)
|
60
|
+
end
|
49
61
|
end
|
data/lib/conf.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
module HTML2FB
|
3
3
|
class Conf
|
4
4
|
def initialize(file)
|
5
|
-
['./',"#{File.dirname(__FILE__)}/","#{File.dirname(__FILE__)}/../confs/"].each do |p|
|
5
|
+
['','./',"#{File.dirname(__FILE__)}/","#{File.dirname(__FILE__)}/../confs/"].each do |p|
|
6
6
|
f=p+file
|
7
7
|
begin
|
8
8
|
if File.readable?(f) && File.exists?(f)
|
data/lib/feedbooks.rb
CHANGED
@@ -54,6 +54,7 @@ module HTML2FB
|
|
54
54
|
class Section
|
55
55
|
@@level=0
|
56
56
|
def to_feedbooks(conf)
|
57
|
+
puts "Sending to feedbooks"
|
57
58
|
fb=FBSession.session
|
58
59
|
post=AtomPost.new "http://#{fb.host}/#{fb.booktype}/#{fb.bookid}/contents.atom"
|
59
60
|
doc=Hpricot('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">'+to_html+'</div>')
|
data/lib/parser.rb
CHANGED
@@ -8,16 +8,27 @@ module HTML2FB
|
|
8
8
|
@conf=conf
|
9
9
|
end
|
10
10
|
|
11
|
+
def extract_text(n)
|
12
|
+
t=''
|
13
|
+
n.traverse_all_element do |e|
|
14
|
+
t+=e.content.to_s if e.is_a?(Hpricot::Text)
|
15
|
+
end
|
16
|
+
t
|
17
|
+
end
|
18
|
+
|
11
19
|
def parse(txt)
|
20
|
+
puts "Parsing HTML"
|
12
21
|
pdoc=Hpricot(txt)
|
13
22
|
doc=Document.new
|
23
|
+
puts "Removing garbage elements"
|
14
24
|
remove_objs(pdoc)
|
15
25
|
ti=pdoc.at('title')
|
16
|
-
doc.title= ti.
|
26
|
+
doc.title= extract_text(ti).strip unless ti.nil?
|
17
27
|
# pdoc.search('//h3').each do |e|
|
18
28
|
# doc.content.push(e.inner_text)
|
19
29
|
# end
|
20
30
|
|
31
|
+
puts "Building TOC"
|
21
32
|
parse_text(pdoc,doc)
|
22
33
|
|
23
34
|
return doc
|
@@ -34,8 +45,10 @@ module HTML2FB
|
|
34
45
|
end unless @conf['remove']['expr'].nil?
|
35
46
|
@conf['remove']['before'].each do |cl|
|
36
47
|
x=doc.at(cl)
|
48
|
+
if x
|
37
49
|
x.preceding.remove
|
38
50
|
x.parent.children.delete(x)
|
51
|
+
end
|
39
52
|
end unless @conf['remove']['before'].nil?
|
40
53
|
@conf['remove']['between'].each do |cl|
|
41
54
|
# puts "between "+cl.inspect
|
@@ -43,11 +56,13 @@ module HTML2FB
|
|
43
56
|
end unless @conf['remove']['between'].nil?
|
44
57
|
@conf['remove']['after'].each do |cl|
|
45
58
|
x=doc.at(cl)
|
59
|
+
if x
|
46
60
|
x.following.remove
|
47
61
|
x.parent.children.delete(x)
|
62
|
+
end
|
48
63
|
end unless @conf['remove']['after'].nil?
|
49
64
|
end
|
50
|
-
File.open('/tmp/test.html','w'){|f| f.write doc.to_html}
|
65
|
+
# File.open('/tmp/test.html','w'){|f| f.write doc.to_html}
|
51
66
|
end
|
52
67
|
|
53
68
|
def parse_text(doc,ret)
|
@@ -59,7 +74,8 @@ module HTML2FB
|
|
59
74
|
tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).to_html
|
60
75
|
tmp.sub!(a.first.to_original_html,'')
|
61
76
|
s.content =[Text.new(tmp)]
|
62
|
-
|
77
|
+
#buggy with entities
|
78
|
+
s.title = extract_text(a.first)
|
63
79
|
ret.content.push s
|
64
80
|
|
65
81
|
end
|
@@ -88,7 +104,7 @@ module HTML2FB
|
|
88
104
|
s=Section.new
|
89
105
|
tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).to_html
|
90
106
|
s.content = [Text.new(tmp)]
|
91
|
-
s.title = a.first
|
107
|
+
s.title = extract_text(a.first)
|
92
108
|
el.content.push s
|
93
109
|
l.content.sub!(tmp,'')
|
94
110
|
l.content.sub!(a.first.to_original_html,'')
|
@@ -117,13 +133,25 @@ end
|
|
117
133
|
module Hpricot::Traverse
|
118
134
|
def between(i,j)
|
119
135
|
#puts i,j
|
120
|
-
unless j.nil?
|
121
|
-
prec=self.at(i).
|
122
|
-
Hpricot::Elements[*self.at(j).
|
136
|
+
unless j.nil? || self.at(j).nil?
|
137
|
+
prec=self.at(i).deep_preceding
|
138
|
+
Hpricot::Elements[*self.at(j).deep_preceding.find_all{|el| !prec.include?el}]
|
123
139
|
else
|
124
|
-
self.at(i).
|
140
|
+
self.at(i).deep_following unless self.at(i).nil?
|
125
141
|
end
|
126
142
|
end
|
143
|
+
|
144
|
+
def deep_preceding()
|
145
|
+
ret=Hpricot::Elements[]
|
146
|
+
ret+=parent.deep_preceding if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
|
147
|
+
ret+=preceding
|
148
|
+
Hpricot::Elements[*ret]
|
149
|
+
end
|
150
|
+
def deep_following()
|
151
|
+
ret=following
|
152
|
+
ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
|
153
|
+
Hpricot::Elements[*ret]
|
154
|
+
end
|
127
155
|
end
|
128
156
|
|
129
157
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zetaben-Html2Feedbooks
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: "0.1"
|
4
|
+
version: "0.2"
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benoit Larroque
|
@@ -22,6 +22,16 @@ dependencies:
|
|
22
22
|
- !ruby/object:Gem::Version
|
23
23
|
version: "0.6"
|
24
24
|
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: htmlentities
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "4.0"
|
34
|
+
version:
|
25
35
|
description: Html2Feedbooks is script to automate basic publishing on feedbooks.com
|
26
36
|
email: zeta dot ben at gmail dot com
|
27
37
|
executables:
|