zetaben-Html2Feedbooks 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +11 -0
- data/bin/html2fb.rb +31 -0
- data/confs/conf.yaml +26 -0
- data/lib/app.rb +49 -0
- data/lib/conf.rb +22 -0
- data/lib/document.rb +56 -0
- data/lib/downloader.rb +24 -0
- data/lib/feedbooks.rb +99 -0
- data/lib/parser.rb +138 -0
- metadata +70 -0
data/README
ADDED
data/bin/html2fb.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
#!/usr/bin/ruby
# Command-line entry point.
#
# Takes the address of an HTML book from ARGV[0] (or asks for it on STDIN
# until one can be opened), downloads it, splits it into sections as
# described by conf.yaml, prints the resulting table of contents as YAML and
# finally posts the sections to Feedbooks.
require 'open-uri'
require 'yaml' # doc.toc.to_yaml below needs the YAML library loaded
require 'conf.rb'
require 'downloader.rb'
require 'document.rb'
require 'parser.rb'
require 'feedbooks.rb'

include HTML2FB

entry=ARGV[0]
url=nil
loop do
  blank=entry.nil? || entry==''
  begin
    url=Downloader.valid_url?(entry)
    break
  rescue StandardError => e
    # Only StandardError is rescued so that Ctrl-C (Interrupt) and exit
    # still terminate the script instead of re-entering the prompt loop.
    unless blank
      STDERR.puts 'Invalid URL'
      puts e
    end
  end
  # Always show the prompt before blocking on STDIN, including after a
  # rejected entry.
  print "URL : "
  entry=STDIN.readline.strip
end

conf=Conf.new('conf.yaml')
content=Downloader.download(url)
doc=Parser.new(conf).parse(content)
puts doc.toc.to_yaml
doc.to_feedbooks(conf)
|
data/confs/conf.yaml
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
remove:
|
2
|
+
class:
|
3
|
+
- totoc
|
4
|
+
- pagenum
|
5
|
+
- totoi
|
6
|
+
- img
|
7
|
+
expr:
|
8
|
+
- 'table'
|
9
|
+
- //pre
|
10
|
+
between:
|
11
|
+
-
|
12
|
+
- "//body"
|
13
|
+
- "//body/h3[4]"
|
14
|
+
after:
|
15
|
+
- '/html/body/h4[3]'
|
16
|
+
select:
|
17
|
+
expr: h3
|
18
|
+
select:
|
19
|
+
expr: h4
|
20
|
+
|
21
|
+
fb:
|
22
|
+
user: #ask#
|
23
|
+
bookid: #ask#
|
24
|
+
booktype: userbook
|
25
|
+
pass: #ask#
|
26
|
+
host: 'feedbooks.com'
|
data/lib/app.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'cgi'
require 'digest/md5'
require 'net/http'
require 'open-uri'
require 'time'
|
5
|
+
|
6
|
+
# Minimal Atom publishing client: builds one Atom <entry> document from the
# accessors below and POSTs it to +addr+, using HTTP basic authentication
# when +user+ is set.
#
# Note that #send deliberately keeps its original name and therefore shadows
# Object#send on instances of this class; use __send__ for dynamic dispatch.
class AtomPost
  attr_accessor :title
  attr_accessor :content
  attr_accessor :date
  attr_accessor :author
  attr_accessor :addr
  attr_accessor :user
  attr_accessor :pass

  # addrs: optional URL (String) the entry will be posted to.
  def initialize(addrs=nil)
    self.addr=addrs unless addrs.nil?
  end

  # Returns the Atom entry as a String.
  #
  # +title+ and +author+ are plain text and are XML-escaped; +content+ is
  # inserted verbatim because callers pass ready-made markup. The <id> is the
  # MD5 hex digest of the raw title followed by the raw content.
  def to_xml
    [
      '<?xml version="1.0"?>',
      '<entry xmlns="http://www.w3.org/2005/Atom">',
      '<title>'+CGI.escapeHTML(title)+'</title>',
      '<id>'+Digest::MD5.hexdigest(title+content)+'</id>',
      '<updated>'+date.xmlschema+'</updated>',
      '<author><name>'+CGI.escapeHTML(author)+'</name></author>',
      '<content>'+content+'</content>',
      '</entry>'
    ].join("\n")+"\n"
  end

  # POSTs the entry to +addr+.
  #
  # Raises StandardError ('Missing Address') when no address was set, and
  # whatever Net::HTTPResponse#error! raises when the server answers with
  # anything other than a success or redirection status.
  def send
    raise StandardError, 'Missing Address' if addr.nil?

    url = URI.parse(addr)
    req = Net::HTTP::Post.new(url.path)
    req.basic_auth user,pass unless user.nil?
    req.body = to_xml
    req.set_content_type('application/atom+xml;type=entry')

    res = Net::HTTP.new(url.host, url.port).start {|http| http.request(req) }
    case res
    when Net::HTTPSuccess, Net::HTTPRedirection
      # OK
    else
      res.error!
    end
  end
end
|
data/lib/conf.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
|
2
|
+
require 'yaml' # YAML::load is used below and nothing else in this file loads it

module HTML2FB
  # Read-only view over a YAML configuration file.
  #
  # The file name is looked up, in order, in the current directory, in the
  # directory of this source file and in its sibling "confs" directory; the
  # first existing, readable file that parses is used.
  class Conf
    # file: name of the configuration file, relative to each search directory.
    #
    # A candidate that cannot be loaded is reported on STDERR and the search
    # continues. When no candidate loads, no configuration is set and #[] will
    # fail on the missing data.
    def initialize(file)
      here=File.dirname(__FILE__)
      ['./',"#{here}/","#{here}/../confs/"].each do |p|
        f=p+file
        begin
          if File.exist?(f) && File.readable?(f)
            @conf=File.open(f,'r'){|txt| YAML::load(txt)}
            return
          end
        rescue StandardError => e
          # Interpolation converts the exception to text; String + Exception
          # raises TypeError on current Rubies.
          STDERR.puts("unreadable conf : #{f}\n#{e}")
        end
      end
    end

    # Returns the top-level configuration entry stored under key +x+.
    def [](x)
      @conf[x]
    end
  end
end
|
data/lib/document.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
module HTML2FB

  # A titled node of the book tree. +content+ is an ordered array whose
  # items are nested Section objects or Text leaves.
  class Section
    attr_accessor :title, :content

    def initialize
      @content=[]
    end

    # HTML of every child, concatenated in order.
    def to_html
      content.map{|child| child.to_html}.join
    end

    # Returns [title, labels] where labels holds, for each direct child, the
    # child's title when it is a Section and the marker '#text' otherwise.
    def titles
      labels=content.map do |child|
        child.is_a?(Section) ? child.title : '#text'
      end
      [title,labels]
    end

    # Debug rendering: a title line followed by the children's to_s output,
    # separated by blank lines.
    def to_s
      body=content.map{|child| child.to_s}.join("\n\n")
      "title :#{title} \n"+body
    end
  end

  # Root of the tree; its direct children are expected to respond to #titles.
  class Document < Section
    # Table of contents: the #titles value of each direct child.
    def toc
      content.map{|entry| entry.titles}
    end
  end

  # Leaf node wrapping a fragment of markup held as a String.
  class Text
    attr_accessor :content

    def initialize(c='')
      @content=c
    end

    # The wrapped fragment, unchanged.
    def to_html
      @content
    end

    # Same value as #to_html.
    def to_s
      @content
    end
  end
end
|
data/lib/downloader.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'tempfile'
|
3
|
+
|
4
|
+
module HTML2FB
  # Fetches the source HTML of a book through open-uri.
  class Downloader
    # Checks that +entry+ parses as a URI and can actually be opened.
    #
    # Returns the parsed URI. Raises whatever URI.parse or the open call
    # raises when the entry is malformed or cannot be opened.
    def self.valid_url?(entry)
      uri=URI.parse(entry)
      # Opened only as a reachability probe; the block form closes the handle.
      open_source(uri){}
      uri
    end

    # Announces the download on STDOUT and returns the body read from +uri+.
    def self.download(uri)
      print "Downloading "
      puts uri.to_s
      open_source(uri){|io| io.read}
    end

    # Opens uri.to_s for reading and yields the IO, closing it afterwards.
    #
    # Kernel#open stopped handling URLs in Ruby 3.0, so URI.open is used
    # whenever open-uri provides it; older Rubies fall back to Kernel.open.
    def self.open_source(uri,&blk)
      target=uri.to_s
      if URI.respond_to?(:open)
        URI.open(target,'r',&blk)
      else
        Kernel.open(target,'r',&blk)
      end
    end
    private_class_method :open_source
  end
end
|
data/lib/feedbooks.rb
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'app.rb'
|
2
|
+
require 'hpricot'
|
3
|
+
require 'digest/md5'
|
4
|
+
|
5
|
+
module HTML2FB
|
6
|
+
|
7
|
+
# Holds the Feedbooks target (host, book type, book id) and credentials for
# the current run. The most recently created instance is remembered and
# returned by FBSession.session.
class FBSession
  # Shape of an MD5 hex digest: exactly 32 lowercase hexadecimal characters.
  MD5_HEX=/\A[a-f0-9]{32}\z/

  attr_accessor :bookid
  attr_accessor :booktype
  attr_accessor :user
  attr_accessor :host
  attr_reader :pass # writer is defined below
  @@fbsession=nil

  # conf: object answering conf['fb'] with the keys 'bookid', 'booktype',
  # 'user', 'pass' and 'host'. Values that are nil or '#ask#' are requested
  # interactively through #ask; a nil host defaults to 'feedbooks.com'.
  def initialize(conf)
    # NOTE(review): the original built StandardError.new('Already in session')
    # here without raising it, so creating a second session has never been an
    # error. That behaviour is kept; the new instance replaces the stored one.
    @@fbsession=self
    self.bookid=ask(conf['fb']['bookid'],"Book Id")
    self.booktype=ask(conf['fb']['booktype'],"Book Type")
    self.user=ask(conf['fb']['user'],"User")
    self.pass=ask(conf['fb']['pass'],"Pass")
    self.host=conf['fb']['host']
    self.host='feedbooks.com' if @host.nil?
  end

  # The session created last, or nil when none exists yet.
  def self.session
    @@fbsession
  end

  # Stores the password as an MD5 hex digest.
  #
  # A value that already is a digest (32 lowercase hex characters, nothing
  # else) is stored unchanged; anything else is hashed. Checking the whole
  # string, rather than counting characters left after stripping
  # non-alphanumerics, keeps plaintext passwords from being mistaken for a
  # digest and stored unhashed.
  def pass=(pas)
    @pass=(pas =~ MD5_HEX) ? pas : Digest::MD5.hexdigest(pas)
  end
end
|
39
|
+
|
40
|
+
|
41
|
+
# Feedbooks publishing support for a whole document.
class Document
  # Creates the FBSession described by +conf+, then asks each direct child
  # of the document, in order, to publish itself.
  def to_feedbooks(conf)
    FBSession.new(conf)
    content.each{|part| part.to_feedbooks(conf)}
  end
end
|
53
|
+
|
54
|
+
# Feedbooks publishing support for a section.
class Section
  # Nesting depth of the #to_html call currently in progress (0 when idle).
  @@level=0

  # Posts this section as one Atom entry to the contents feed of the book
  # held by the current FBSession. The section's HTML is wrapped in a div and
  # every element name is prefixed with "xhtml:" before sending.
  def to_feedbooks(conf)
    fb=FBSession.session
    post=AtomPost.new "http://#{fb.host}/#{fb.booktype}/#{fb.bookid}/contents.atom"
    doc=Hpricot('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">'+to_html+'</div>')
    doc.traverse_all_element do |e|
      unless e.is_a?Hpricot::Text
        e.stag.name='xhtml:'+e.stag.name
        e.etag.name='xhtml:'+e.etag.name unless e.etag.nil?
      end
    end
    post.content=doc.to_html
    post.user=fb.user
    post.pass=fb.pass
    post.date=Time.now
    post.author=fb.user
    post.title=title
    post.send
  end

  alias :old_to_html :to_html

  # Depth-aware rendering: the outermost section renders only its children
  # (its title travels as the Atom entry title), while a section nested at
  # depth n (n >= 2) is preceded by an <h(n+1)> heading carrying its title.
  #
  # The depth counter is restored in an ensure clause so that an exception
  # raised while rendering cannot leave it incremented for later calls.
  def to_html
    @@level+=1
    if @@level==1
      old_to_html
    else
      "<h#{@@level+1}>"+title+"</h#{@@level+1}>"+old_to_html
    end
  ensure
    @@level-=1
  end
end
|
89
|
+
end
|
90
|
+
|
91
|
+
# Returns +txt+ unchanged unless it is nil or the placeholder '#ask#'; in
# that case prints "<disp> : " and reads lines from STDIN until a non-empty
# (after stripping) answer is given, which is returned.
def ask(txt,disp='Prompt')
  needs_prompt=txt.nil? || txt =='#ask#'
  return txt unless needs_prompt

  answer=nil
  loop do
    print disp+' : '
    answer=STDIN.readline.strip
    break unless answer.nil? || answer.size==0
  end
  answer
end
|
data/lib/parser.rb
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
require 'hpricot'
|
2
|
+
require 'document.rb'
|
3
|
+
|
4
|
+
module HTML2FB
  # Turns raw HTML into a Document tree, driven by a configuration object
  # read through conf['remove'] (clean-up rules) and conf['select'] (nested
  # heading expressions that delimit sections).
  class Parser

    # conf: object answering [] for the 'remove' and 'select' keys.
    def initialize(conf)
      @conf=conf
    end

    # Parses the HTML string +txt+ and returns the resulting Document.
    # The document title comes from the first <title> element, when present.
    def parse(txt)
      pdoc=Hpricot(txt)
      doc=Document.new
      remove_objs(pdoc)
      ti=pdoc.at('title')
      doc.title= ti.inner_text.strip unless ti.nil?

      parse_text(pdoc,doc)

      doc
    end

    protected

    # Applies the 'remove' rules to +doc+ in place, in this order: elements
    # by class, elements by expression, everything before a node (and the
    # node), everything between two nodes, everything after a node (and the
    # node). A rule whose selector matches nothing is skipped.
    def remove_objs(doc)
      rules=@conf['remove']
      return unless rules

      (rules['class'] || []).each do |cl|
        doc.search('.'+cl).remove
      end
      (rules['expr'] || []).each do |cl|
        doc.search(cl).remove
      end
      (rules['before'] || []).each do |cl|
        x=doc.at(cl)
        next if x.nil?
        x.preceding.remove
        x.parent.children.delete(x)
      end
      (rules['between'] || []).each do |cl|
        # between dereferences both anchors, so both must exist.
        next if doc.at(cl.first).nil? || doc.at(cl.last).nil?
        doc.between(cl.first,cl.last).remove
      end
      (rules['after'] || []).each do |cl|
        x=doc.at(cl)
        next if x.nil?
        x.following.remove
        x.parent.children.delete(x)
      end
    end

    # Splits +doc+ at every element matching the top-level select expression
    # and appends one Section per match to +ret+. Each section's text is the
    # markup between its heading and the next one (or the end), with the
    # heading itself stripped. Deeper select levels are then applied.
    def parse_text(doc,ret)
      ti = doc.search('//'+@conf['select']['expr'])
      if ti.empty?
        # Pairing an empty match list would fail on nil; report and stop.
        STDERR.puts "no element matches select expr : #{@conf['select']['expr']}"
        return
      end

      heading_pairs(ti).each do |a|
        s=Section.new
        tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).to_html
        tmp.sub!(a.first.to_original_html,'')
        s.content =[Text.new(tmp)]
        s.title = a.first.inner_text.to_s
        ret.content.push s
      end

      parse_rec(ret,@conf['select']) if @conf['select']['select']
    end

    # Recursively refines the children of section +el+ with select level
    # +conf+: child sections are handed the next level (conf['select']), and
    # text children are split at conf['expr'] into new sections appended to
    # +el+, the extracted markup being cut out of the text.
    #
    # New sections are appended to el.content while it is being iterated;
    # Array#each visits them too, which is how they reach the next level.
    def parse_rec(el,conf)
      return if conf.nil?
      return unless el.is_a?(Section)

      el.content.each do |l|
        if l.is_a?(Section)
          parse_rec(l,conf['select'])
          next
        end

        doc=Hpricot(l.content)
        ti = doc.search('//'+conf['expr'])
        # A text without sub-headings only ends its own processing; the
        # remaining children are still visited.
        next if ti.size ==0

        heading_pairs(ti).each do |a|
          s=Section.new
          tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).to_html
          s.content = [Text.new(tmp)]
          s.title = a.first.inner_text.to_s
          el.content.push s
          l.content.sub!(tmp,'')
          l.content.sub!(a.first.to_original_html,'')
        end
      end
    end

    # Pairs every heading with its successor; the last one is paired with nil.
    # +ti+ must not be empty.
    def heading_pairs(ti)
      ti.zip ti[1..-1]+[nil]
    end
  end
end
|
103
|
+
|
104
|
+
|
105
|
+
# Core extension: emptiness test shared with NilClass#blank?.
class String
  # True only for the zero-length string; whitespace-only strings are not blank.
  def blank?
    empty?
  end
end
|
110
|
+
|
111
|
+
# Core extension: lets nil answer blank? the same way an empty String does.
class NilClass
  # nil is always blank.
  def blank?; true; end
end
|
116
|
+
|
117
|
+
# Extension of Hpricot's traversal mixin.
module Hpricot::Traverse
  # Nodes located between the elements matched by expressions +i+ and +j+.
  #
  # With +j+ nil, returns at(i).following. Otherwise returns, as
  # Hpricot::Elements, the nodes of at(j).preceding that are not also in
  # at(i).preceding.
  def between(i,j)
    return self.at(i).following if j.nil?

    before_start=self.at(i).preceding
    kept=self.at(j).preceding.reject{|node| before_start.include?(node)}
    Hpricot::Elements[*kept]
  end
end
|
128
|
+
|
129
|
+
|
130
|
+
# Extensions of Hpricot's element collection.
class Hpricot::Elements
  # Calls between(i,j) on every member and gathers the results into a new
  # Hpricot::Elements.
  def between(i,j)
    spans=self.map{|member| member.between(i,j)}
    Hpricot::Elements[*spans]
  end

  # Members of this collection that are not included in +a+.
  def -(a)
    remaining=self.reject{|el| a.include?(el)}
    Hpricot::Elements[*remaining]
  end
end
|
metadata
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: zetaben-Html2Feedbooks
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: "0.1"
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Benoit Larroque
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-04-28 00:00:00 -07:00
|
13
|
+
default_executable: html2fb.rb
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hpricot
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0.6"
|
24
|
+
version:
|
25
|
+
description: Html2Feedbooks is a script to automate basic publishing on feedbooks.com
|
26
|
+
email: zeta dot ben at gmail dot com
|
27
|
+
executables:
|
28
|
+
- html2fb.rb
|
29
|
+
extensions: []
|
30
|
+
|
31
|
+
extra_rdoc_files: []
|
32
|
+
|
33
|
+
files:
|
34
|
+
- README
|
35
|
+
- confs/conf.yaml
|
36
|
+
- lib/app.rb
|
37
|
+
- lib/conf.rb
|
38
|
+
- lib/document.rb
|
39
|
+
- lib/downloader.rb
|
40
|
+
- lib/feedbooks.rb
|
41
|
+
- bin/html2fb.rb
|
42
|
+
- lib/parser.rb
|
43
|
+
has_rdoc: true
|
44
|
+
homepage: http://github.com/Html2Feedbooks
|
45
|
+
post_install_message:
|
46
|
+
rdoc_options: []
|
47
|
+
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: "0"
|
55
|
+
version:
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: "0"
|
61
|
+
version:
|
62
|
+
requirements: []
|
63
|
+
|
64
|
+
rubyforge_project:
|
65
|
+
rubygems_version: 1.2.0
|
66
|
+
signing_key:
|
67
|
+
specification_version: 2
|
68
|
+
summary: Html2Feedbooks is a script to automate basic publishing on feedbooks.com
|
69
|
+
test_files: []
|
70
|
+
|