zetaben-Html2Feedbooks 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,11 @@
1
+ HTML2Feedbooks
2
+ ==============
3
+
4
+ A script to automate basic publishing work on Feedbooks.com.
5
+
6
+ Usage
7
+ -----
8
+
9
+ ./html2fb.rb URL.html
10
+
11
+ You can change settings in confs/conf.yaml
data/bin/html2fb.rb ADDED
@@ -0,0 +1,31 @@
1
#!/usr/bin/ruby
# Command-line entry point.
#
# Takes a URL (or path) as the first argument, or prompts for one until
# Downloader.valid_url? accepts it; then downloads the page, parses it with
# the settings from conf.yaml, prints the table of contents as YAML and
# posts the document through Document#to_feedbooks.
require 'open-uri'
require 'yaml' # needed for the to_yaml call on the table of contents
require 'conf.rb'
require 'downloader.rb'
require 'document.rb'
require 'parser.rb'
require 'feedbooks.rb'

include HTML2FB

entry = ARGV[0]
url = nil
loop do
  # A blank entry cannot be valid, so go straight to the prompt.
  unless entry.nil? || entry == ''
    begin
      url = Downloader.valid_url?(entry)
      break
    rescue StandardError => e
      # StandardError only: Ctrl-C (Interrupt) and exit must still end the program.
      STDERR.puts 'Invalid URL'
      puts e
    end
  end

  print "URL : "
  begin
    entry = STDIN.readline.strip
  rescue EOFError
    # Input stream closed: there is nothing left to ask.
    abort 'No URL given'
  end
end

conf = Conf.new('conf.yaml')
content = Downloader.download(url)
doc = Parser.new(conf).parse(content)
puts doc.toc.to_yaml
doc.to_feedbooks(conf)
data/confs/conf.yaml ADDED
@@ -0,0 +1,26 @@
1
+ remove:
2
+ class:
3
+ - totoc
4
+ - pagenum
5
+ - totoi
6
+ - img
7
+ expr:
8
+ - 'table'
9
+ - //pre
10
+ between:
11
+ -
12
+ - "//body"
13
+ - "//body/h3[4]"
14
+ after:
15
+ - '/html/body/h4[3]'
16
+ select:
17
+ expr: h3
18
+ select:
19
+ expr: h4
20
+
21
+ fb:
22
+ user: #ask#
23
+ bookid: #ask#
24
+ booktype: userbook
25
+ pass: #ask#
26
+ host: 'feedbooks.com'
data/lib/app.rb ADDED
@@ -0,0 +1,49 @@
1
+ require 'digest/md5'
2
+ require 'open-uri'
3
+ require 'net/http'
4
+ require 'time'
5
+
6
+ class AtomPost
7
+ attr_accessor :title
8
+ attr_accessor :content
9
+ attr_accessor :date
10
+ attr_accessor :author
11
+ attr_accessor :addr
12
+ attr_accessor :user
13
+ attr_accessor :pass
14
+
15
+ def initialize(addrs=nil)
16
+ self.addr=addrs unless addrs.nil?
17
+ end
18
+
19
+ def send
20
+ raise StandardError.new('Missing Address') if addr.nil?
21
+ #3: Detailed control
22
+ url = URI.parse(addr)
23
+ req = Net::HTTP::Post.new(url.path)
24
+ req.basic_auth user,pass unless user.nil?
25
+
26
+ req.body = '<?xml version="1.0"?>'+"\n"
27
+ req.body +='<entry xmlns="http://www.w3.org/2005/Atom">'+"\n"
28
+ req.body +='<title>'+title+'</title>'+"\n"
29
+ req.body +='<id>'+Digest::MD5.hexdigest(title+content)+'</id>'+"\n"
30
+ req.body +='<updated>'+date.xmlschema+'</updated>'+"\n"
31
+ req.body +='<author><name>'+author+'</name></author>'+"\n"
32
+ req.body +='<content>'+content+'</content>'+"\n"
33
+ req.body +='</entry>'+"\n"
34
+
35
+ req.set_content_type('application/atom+xml;type=entry')
36
+
37
+ File.open('/tmp/test4.txt','w') do |f|
38
+ f << req.body
39
+ end
40
+
41
+ res = Net::HTTP.new(url.host, url.port).start {|http| http.request(req) }
42
+ case res
43
+ when Net::HTTPSuccess, Net::HTTPRedirection
44
+ # OK
45
+ else
46
+ res.error!
47
+ end
48
+ end
49
+ end
data/lib/conf.rb ADDED
@@ -0,0 +1,22 @@
1
+
2
require 'yaml'

module HTML2FB
  # Read-only access to the settings stored in a YAML configuration file.
  class Conf
    # Loads +file+ from the first location where it parses successfully:
    # the current directory, the directory of this source file, then the
    # sibling "confs" directory. A file that cannot be read or parsed is
    # reported on STDERR and the next location is tried. When nothing can be
    # loaded the configuration is empty and every lookup returns nil.
    def initialize(file)
      @conf = {}
      here = File.dirname(__FILE__)
      ['./', "#{here}/", "#{here}/../confs/"].each do |dir|
        path = dir + file
        next unless File.file?(path) && File.readable?(path)

        begin
          # An empty YAML document loads as false/nil; treat it as no settings.
          @conf = File.open(path, 'r') { |io| YAML.load(io) } || {}
          return
        rescue StandardError => e
          STDERR.puts("unreadable conf : #{path}\n#{e}")
        end
      end
    end

    # Returns the value stored under top-level key +x+, or nil.
    def [](x)
      @conf[x]
    end
  end
end
data/lib/document.rb ADDED
@@ -0,0 +1,56 @@
1
module HTML2FB
  # A titled node of the document tree. Its +content+ is an ordered list of
  # children, each of which answers #to_html and #to_s (nested sections or
  # Text leaves).
  class Section
    attr_accessor :title, :content

    def initialize
      @content = []
    end

    # Concatenated HTML of every child, in order.
    def to_html
      content.map { |child| child.to_html }.join
    end

    # Returns [own title, child labels]; a child label is the child's title
    # for a Section and the literal '#text' for anything else.
    def titles
      labels = content.map { |child| child.is_a?(Section) ? child.title : '#text' }
      [title, labels]
    end

    # Debug rendering: a title line followed by the children's #to_s,
    # separated by blank lines.
    def to_s
      body = content.map { |child| child.to_s }.join("\n\n")
      "title :#{title} \n" + body
    end
  end

  # Root of the tree.
  class Document < Section
    # Table of contents: the #titles of every direct child.
    def toc
      content.map { |child| child.titles }
    end
  end

  # Leaf node wrapping a piece of raw markup.
  class Text
    attr_accessor :content

    def initialize(c = '')
      @content = c
    end

    # The wrapped markup, unchanged.
    def to_html
      @content
    end

    # Same value as #to_html.
    def to_s
      @content
    end
  end
end
data/lib/downloader.rb ADDED
@@ -0,0 +1,24 @@
1
+ require 'open-uri'
2
+ require 'tempfile'
3
+
4
module HTML2FB
  # Fetches the source document from a URL or from a local path.
  class Downloader
    # Checks that +entry+ parses as a URI and can actually be opened.
    #
    # Returns the parsed URI. Raises (URI::InvalidURIError, an open-uri or
    # file system error, ...) when the entry cannot be parsed or opened; the
    # caller treats any such exception as "invalid".
    def self.valid_url?(entry)
      uri = URI.parse(entry)
      # Block form so the handle opened for the check is closed immediately.
      open_source(uri) { |_io| }
      uri
    end

    # Announces the download on standard output and returns the whole body
    # of +uri+ (a URI, or anything whose #to_s is a URL or a path).
    def self.download(uri)
      print "Downloading "
      puts uri.to_s
      open_source(uri) { |io| io.read }
    end

    # Yields a read handle for +source+ and closes it afterwards.
    #
    # URIs that open-uri knows how to fetch respond to #open and are read
    # through it; everything else is treated as a local path and opened with
    # File.open, which never spawns a process for "|command" input.
    def self.open_source(source, &block)
      remote = remote_uri(source)
      if remote
        remote.open('r', &block)
      else
        File.open(source.to_s, 'r', &block)
      end
    end
    private_class_method :open_source

    # Returns +source+ as a URI that open-uri can fetch, or nil when it is
    # not one (plain path, unparsable string, unsupported scheme).
    def self.remote_uri(source)
      uri = source.is_a?(URI::Generic) ? source : URI.parse(source.to_s)
      uri.respond_to?(:open) ? uri : nil
    rescue URI::InvalidURIError
      nil
    end
    private_class_method :remote_uri
  end
end
data/lib/feedbooks.rb ADDED
@@ -0,0 +1,99 @@
1
+ require 'app.rb'
2
+ require 'hpricot'
3
+ require 'digest/md5'
4
+
5
+ module HTML2FB
6
+
7
# Credentials and target book for the current upload. Only one session may
# exist per process; it is reachable through FBSession.session.
class FBSession
  attr_accessor :bookid
  attr_accessor :booktype
  attr_accessor :user
  attr_reader :pass
  attr_accessor :host

  @@fbsession = nil

  # Reads the settings under conf['fb'], prompting through the top-level
  # +ask+ helper for the ones it decides are missing. The host falls back to
  # 'feedbooks.com'.
  #
  # Raises StandardError ('Already in session') when a session already exists.
  def initialize(conf)
    raise StandardError, 'Already in session' unless @@fbsession.nil?

    fb = conf['fb']
    self.bookid = ask(fb['bookid'], "Book Id")
    self.booktype = ask(fb['booktype'], "Book Type")
    self.user = ask(fb['user'], "User")
    self.pass = ask(fb['pass'], "Pass")
    self.host = fb['host'] || 'feedbooks.com'
    # Registered last so a failed prompt does not leave a half-built session.
    @@fbsession = self
  end

  # The session created by the last successful FBSession.new, or nil.
  def self.session
    return @@fbsession
  end

  # Stores the password as an MD5 hex digest. A value that already is a
  # 32-character lowercase hex digest is kept as given; anything else is
  # hashed.
  def pass=(pas)
    @pass = pas =~ /\A[0-9a-f]{32}\z/ ? pas : Digest::MD5.hexdigest(pas)
  end
end
39
+
40
+
41
# Feedbooks upload support for the document root.
class Document
  # Opens the upload session from +conf+, then asks every top-level child,
  # in order, to upload itself. Returns the list of children.
  def to_feedbooks(conf)
    FBSession.new(conf)
    content.each { |child| child.to_feedbooks(conf) }
  end
end
53
+
54
# Feedbooks upload support for sections.
class Section
  # Nesting depth of the #to_html call currently running; 0 when idle.
  @@level = 0

  # Posts this section as one Atom entry to the contents feed of the book
  # described by the current FBSession. The section's HTML is wrapped in a
  # div declaring the xhtml namespace prefix, and every non-text node gets
  # that prefix on its start and end tag names. +conf+ is not read here.
  # Returns whatever AtomPost#send returns.
  def to_feedbooks(conf)
    fb = FBSession.session
    post = AtomPost.new "http://#{fb.host}/#{fb.booktype}/#{fb.bookid}/contents.atom"
    doc = Hpricot('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">' + to_html + '</div>')
    doc.traverse_all_element do |e|
      unless e.is_a?(Hpricot::Text)
        e.stag.name = 'xhtml:' + e.stag.name
        e.etag.name = 'xhtml:' + e.etag.name unless e.etag.nil?
      end
    end
    post.content = doc.to_html
    post.user = fb.user
    post.pass = fb.pass
    post.date = Time.now
    post.author = fb.user
    post.title = title
    post.send
  end

  alias :old_to_html :to_html

  # HTML of this section. The outermost section being rendered yields only
  # its children's HTML; a section nested at depth n (n >= 2) is preceded by
  # its title in an <h(n+1)> heading.
  def to_html
    @@level += 1
    body = old_to_html
    if @@level == 1
      body
    else
      "<h#{@@level + 1}>#{title}</h#{@@level + 1}>#{body}"
    end
  ensure
    # Always undo the increment, even when a child fails to render, so the
    # shared depth counter is correct for the next call.
    @@level -= 1
  end
end
89
+ end
90
+
91
# Returns +txt+ unless it is nil or the placeholder '#ask#'; otherwise
# prompts on standard output with "<disp> : " and reads lines from STDIN
# until one is non-empty after stripping, and returns that stripped line.
def ask(txt, disp = 'Prompt')
  needs_prompt = txt.nil? || txt == '#ask#'
  return txt unless needs_prompt

  loop do
    print disp + ' : '
    answer = STDIN.readline.strip
    return answer unless answer.nil? || answer.size == 0
  end
end
data/lib/parser.rb ADDED
@@ -0,0 +1,138 @@
1
+ require 'hpricot'
2
+ require 'document.rb'
3
+
4
module HTML2FB
  # Turns an HTML page into a Document tree, driven by the 'remove' and
  # 'select' settings of the configuration object given to the constructor.
  class Parser

    # conf:: object answering #[] with the 'remove' and 'select' settings.
    def initialize(conf)
      @conf = conf
    end

    # Parses the HTML string +txt+ and returns a Document whose title is the
    # stripped text of the page's <title> element (when there is one) and
    # whose content is built by #parse_text.
    def parse(txt)
      pdoc = Hpricot(txt)
      doc = Document.new
      remove_objs(pdoc)
      ti = pdoc.at('title')
      doc.title = ti.inner_text.strip unless ti.nil?

      parse_text(pdoc, doc)

      doc
    end

    protected

    # Applies the rules under @conf['remove'] to +doc+, in this order:
    # 'class' (elements with one of the listed classes), 'expr' (elements
    # matching the listed expressions), 'before' (the first match and
    # everything preceding it), 'between' (the result of doc.between for
    # each [first, last] pair) and 'after' (the first match and everything
    # following it). Missing rule lists are skipped.
    def remove_objs(doc)
      rules = @conf['remove']
      return unless rules

      Array(rules['class']).each { |cl| doc.search('.' + cl).remove }
      Array(rules['expr']).each { |expr| doc.search(expr).remove }

      Array(rules['before']).each do |expr|
        x = doc.at(expr)
        next if x.nil? # nothing matches this expression in this page

        x.preceding.remove
        x.parent.children.delete(x)
      end

      Array(rules['between']).each do |pair|
        doc.between(pair.first, pair.last).remove
      end

      Array(rules['after']).each do |expr|
        x = doc.at(expr)
        next if x.nil? # nothing matches this expression in this page

        x.following.remove
        x.parent.children.delete(x)
      end
    end

    # Splits +doc+ at every element matching @conf['select']['expr'] and
    # appends one Section per match to ret.content: the title is the match's
    # text, the content is a single Text holding the markup up to the next
    # match with the heading's own markup removed. When a nested 'select'
    # is configured, the new sections are subdivided by #parse_rec.
    def parse_text(doc, ret)
      ti = doc.search('//' + @conf['select']['expr'])
      return if ti.empty? # no headings: the document keeps an empty content

      # Pair every heading with the following one (nil after the last).
      tit = ti.zip(ti[1..-1] + [nil])

      tit.each do |a|
        s = Section.new
        tmp = doc.between(a.first.xpath, a.last.nil? ? nil : a.last.xpath).to_html
        tmp.sub!(a.first.to_original_html, '')
        s.content = [Text.new(tmp)]
        s.title = a.first.inner_text.to_s
        ret.content.push s
      end

      parse_rec(ret, @conf['select']) if @conf['select']['select']
    end

    # Walks the children of section +el+. A Section child is processed
    # recursively with the nested settings conf['select']; any other child
    # has its markup searched for conf['expr'], and every match becomes a
    # new Section appended to el.content while its markup is cut out of the
    # child's text.
    #
    # Sections appended here are reached later by the same iteration.
    # NOTE(review): a text child without any match ends the whole walk
    # (return, not next) — confirm this is intended before changing it.
    def parse_rec(el, conf)
      return if conf.nil?
      return unless el.is_a?(Section)

      el.content.each do |l|
        if l.is_a?(Section)
          parse_rec(l, conf['select'])
        else
          doc = Hpricot(l.content)
          ti = doc.search('//' + conf['expr'])
          return if ti.size == 0

          tit = ti.zip(ti[1..-1] + [nil])

          tit.each do |a|
            s = Section.new
            tmp = doc.between(a.first.xpath, a.last.nil? ? nil : a.last.xpath).to_html
            s.content = [Text.new(tmp)]
            s.title = a.first.inner_text.to_s
            el.content.push s
            l.content.sub!(tmp, '')
            l.content.sub!(a.first.to_original_html, '')
          end
        end
      end
    end
  end
end
103
+
104
+
105
class String
  # True only for the empty string; whitespace-only strings are not blank.
  def blank?
    empty?
  end
end
110
+
111
class NilClass
  # nil always counts as blank.
  def blank?; true; end
end
116
+
117
module Hpricot::Traverse
  # Selects the nodes lying between two locations of this node.
  #
  # With +j+ given: the members of at(j).preceding that are not members of
  # at(i).preceding, wrapped in an Hpricot::Elements.
  # With +j+ nil: at(i).following.
  def between(i, j)
    return at(i).following if j.nil?

    before_start = at(i).preceding
    kept = at(j).preceding.reject { |node| before_start.include?(node) }
    Hpricot::Elements[*kept]
  end
end
128
+
129
+
130
class Hpricot::Elements
  # Calls between(i, j) on every member and gathers the per-member results
  # into a new Hpricot::Elements.
  def between(i, j)
    per_member = map { |node| node.between(i, j) }
    Hpricot::Elements[*per_member]
  end

  # Difference: the members of this collection that +a+ does not include,
  # as a new Hpricot::Elements.
  def -(a)
    remaining = reject { |node| a.include?(node) }
    Hpricot::Elements[*remaining]
  end
end
metadata ADDED
@@ -0,0 +1,70 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: zetaben-Html2Feedbooks
3
+ version: !ruby/object:Gem::Version
4
+ version: "0.1"
5
+ platform: ruby
6
+ authors:
7
+ - Benoit Larroque
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-04-28 00:00:00 -07:00
13
+ default_executable: html2fb.rb
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hpricot
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0.6"
24
+ version:
25
+ description: Html2Feedbooks is script to automate basic publishing on feedbooks.com
26
+ email: zeta dot ben at gmail dot com
27
+ executables:
28
+ - html2fb.rb
29
+ extensions: []
30
+
31
+ extra_rdoc_files: []
32
+
33
+ files:
34
+ - README
35
+ - confs/conf.yaml
36
+ - lib/app.rb
37
+ - lib/conf.rb
38
+ - lib/document.rb
39
+ - lib/downloader.rb
40
+ - lib/feedbooks.rb
41
+ - bin/html2fb.rb
42
+ - lib/parser.rb
43
+ has_rdoc: true
44
+ homepage: http://github.com/Html2Feedbooks
45
+ post_install_message:
46
+ rdoc_options: []
47
+
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: "0"
55
+ version:
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: "0"
61
+ version:
62
+ requirements: []
63
+
64
+ rubyforge_project:
65
+ rubygems_version: 1.2.0
66
+ signing_key:
67
+ specification_version: 2
68
+ summary: Html2Feedbooks is script to automate basic publishing on feedbooks.com
69
+ test_files: []
70
+