zetaben-Html2Feedbooks 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README CHANGED
@@ -8,4 +8,6 @@ Usage
8
8
 
9
9
  ./html2fb URL.html
10
10
 
11
- You can change settings in confs/conf.yaml
11
+ You can change some settings by creating your own configuration file and using
12
+
13
+ html2fb -c myconf.yaml URL.html
data/bin/html2fb.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/ruby
2
+ require 'optparse'
2
3
  require 'open-uri'
3
4
  require 'conf.rb'
4
5
  require 'downloader.rb'
@@ -8,6 +9,20 @@ require 'feedbooks.rb'
8
9
 
9
10
  include HTML2FB
10
11
 
12
+ options = {}
13
+ options[:conf] = "conf.yaml"
14
+ options[:preview] = true
15
+ OptionParser.new do |opts|
16
+ opts.banner = "Usage: html2fb [options] URL"
17
+
18
+ opts.on("-c", "--conf FILE", String,"Configuration file") do |f|
19
+ options[:conf] = f
20
+ end
21
+ opts.on("-s", "-s","Send to feedbooks") do |f|
22
+ options[:preview] = !f
23
+ end
24
+ end.parse!
25
+
11
26
  valid=false
12
27
  entry=ARGV[0]
13
28
  while !valid
@@ -23,9 +38,16 @@ while !valid
23
38
  print "URL : " if entry.nil? || entry==''
24
39
  entry=STDIN.readline.strip unless valid
25
40
  end
26
- conf=Conf.new('conf.yaml')
41
+ conf=Conf.new(options[:conf])
27
42
  content=Downloader.download(url)
28
43
  #puts content.size
29
44
  doc=Parser.new(conf).parse(content)
30
45
  puts doc.toc.to_yaml
46
+ if options[:preview]
47
+ f=File.open('/tmp/plop.html','w')
48
+ f.write doc.to_html
49
+ f.close
50
+ `firefox /tmp/plop.html`
51
+ else
31
52
  doc.to_feedbooks(conf)
53
+ end
data/lib/app.rb CHANGED
@@ -2,6 +2,7 @@ require 'digest/md5'
2
2
  require 'open-uri'
3
3
  require 'net/http'
4
4
  require 'time'
5
+ require 'htmlentities'
5
6
 
6
7
  class AtomPost
7
8
  attr_accessor :title
@@ -25,11 +26,11 @@ class AtomPost
25
26
 
26
27
  req.body = '<?xml version="1.0"?>'+"\n"
27
28
  req.body +='<entry xmlns="http://www.w3.org/2005/Atom">'+"\n"
28
- req.body +='<title>'+title+'</title>'+"\n"
29
+ req.body +='<title>'+recode_text(title)+'</title>'+"\n"
29
30
  req.body +='<id>'+Digest::MD5.hexdigest(title+content)+'</id>'+"\n"
30
31
  req.body +='<updated>'+date.xmlschema+'</updated>'+"\n"
31
32
  req.body +='<author><name>'+author+'</name></author>'+"\n"
32
- req.body +='<content>'+content+'</content>'+"\n"
33
+ req.body +='<content>'+recode_text(content)+'</content>'+"\n"
33
34
  req.body +='</entry>'+"\n"
34
35
 
35
36
  req.set_content_type('application/atom+xml;type=entry')
@@ -46,4 +47,15 @@ class AtomPost
46
47
  res.error!
47
48
  end
48
49
  end
50
+
51
+ def recode_text(txt)
52
+ return txt if txt.blank?
53
+ m=Hpricot(txt)
54
+ m.traverse_text{|t| t.content=force_decimal_entities(t.content) if t.content.match(/&[a-z][a-z0-9]+;/i)}
55
+ m.to_html
56
+ end
57
+ HTMLENCODER=HTMLEntities.new
58
+ def force_decimal_entities(txt)
59
+ HTMLENCODER.encode(HTMLENCODER.decode(txt),:decimal)
60
+ end
49
61
  end
data/lib/conf.rb CHANGED
@@ -2,7 +2,7 @@
2
2
  module HTML2FB
3
3
  class Conf
4
4
  def initialize(file)
5
- ['./',"#{File.dirname(__FILE__)}/","#{File.dirname(__FILE__)}/../confs/"].each do |p|
5
+ ['','./',"#{File.dirname(__FILE__)}/","#{File.dirname(__FILE__)}/../confs/"].each do |p|
6
6
  f=p+file
7
7
  begin
8
8
  if File.readable?(f) && File.exists?(f)
data/lib/feedbooks.rb CHANGED
@@ -54,6 +54,7 @@ module HTML2FB
54
54
  class Section
55
55
  @@level=0
56
56
  def to_feedbooks(conf)
57
+ puts "Sending to feedbooks"
57
58
  fb=FBSession.session
58
59
  post=AtomPost.new "http://#{fb.host}/#{fb.booktype}/#{fb.bookid}/contents.atom"
59
60
  doc=Hpricot('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">'+to_html+'</div>')
data/lib/parser.rb CHANGED
@@ -8,16 +8,27 @@ module HTML2FB
8
8
  @conf=conf
9
9
  end
10
10
 
11
+ def extract_text(n)
12
+ t=''
13
+ n.traverse_all_element do |e|
14
+ t+=e.content.to_s if e.is_a?(Hpricot::Text)
15
+ end
16
+ t
17
+ end
18
+
11
19
  def parse(txt)
20
+ puts "Parsing HTML"
12
21
  pdoc=Hpricot(txt)
13
22
  doc=Document.new
23
+ puts "Removing garbage elements"
14
24
  remove_objs(pdoc)
15
25
  ti=pdoc.at('title')
16
- doc.title= ti.inner_text.strip unless ti.nil?
26
+ doc.title= extract_text(ti).strip unless ti.nil?
17
27
  # pdoc.search('//h3').each do |e|
18
28
  # doc.content.push(e.inner_text)
19
29
  # end
20
30
 
31
+ puts "Building TOC"
21
32
  parse_text(pdoc,doc)
22
33
 
23
34
  return doc
@@ -34,8 +45,10 @@ module HTML2FB
34
45
  end unless @conf['remove']['expr'].nil?
35
46
  @conf['remove']['before'].each do |cl|
36
47
  x=doc.at(cl)
48
+ if x
37
49
  x.preceding.remove
38
50
  x.parent.children.delete(x)
51
+ end
39
52
  end unless @conf['remove']['before'].nil?
40
53
  @conf['remove']['between'].each do |cl|
41
54
  # puts "between "+cl.inspect
@@ -43,11 +56,13 @@ module HTML2FB
43
56
  end unless @conf['remove']['between'].nil?
44
57
  @conf['remove']['after'].each do |cl|
45
58
  x=doc.at(cl)
59
+ if x
46
60
  x.following.remove
47
61
  x.parent.children.delete(x)
62
+ end
48
63
  end unless @conf['remove']['after'].nil?
49
64
  end
50
- File.open('/tmp/test.html','w'){|f| f.write doc.to_html}
65
+ # File.open('/tmp/test.html','w'){|f| f.write doc.to_html}
51
66
  end
52
67
 
53
68
  def parse_text(doc,ret)
@@ -59,7 +74,8 @@ module HTML2FB
59
74
  tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).to_html
60
75
  tmp.sub!(a.first.to_original_html,'')
61
76
  s.content =[Text.new(tmp)]
62
- s.title = a.first.inner_text.to_s
77
+ #buggy with entities
78
+ s.title = extract_text(a.first)
63
79
  ret.content.push s
64
80
 
65
81
  end
@@ -88,7 +104,7 @@ module HTML2FB
88
104
  s=Section.new
89
105
  tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).to_html
90
106
  s.content = [Text.new(tmp)]
91
- s.title = a.first.inner_text.to_s
107
+ s.title = extract_text(a.first)
92
108
  el.content.push s
93
109
  l.content.sub!(tmp,'')
94
110
  l.content.sub!(a.first.to_original_html,'')
@@ -117,13 +133,25 @@ end
117
133
  module Hpricot::Traverse
118
134
  def between(i,j)
119
135
  #puts i,j
120
- unless j.nil?
121
- prec=self.at(i).preceding
122
- Hpricot::Elements[*self.at(j).preceding.find_all{|el| !prec.include?el}]
136
+ unless j.nil? || self.at(j).nil?
137
+ prec=self.at(i).deep_preceding
138
+ Hpricot::Elements[*self.at(j).deep_preceding.find_all{|el| !prec.include?el}]
123
139
  else
124
- self.at(i).following
140
+ self.at(i).deep_following unless self.at(i).nil?
125
141
  end
126
142
  end
143
+
144
+ def deep_preceding()
145
+ ret=Hpricot::Elements[]
146
+ ret+=parent.deep_preceding if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
147
+ ret+=preceding
148
+ Hpricot::Elements[*ret]
149
+ end
150
+ def deep_following()
151
+ ret=following
152
+ ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
153
+ Hpricot::Elements[*ret]
154
+ end
127
155
  end
128
156
 
129
157
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: zetaben-Html2Feedbooks
3
3
  version: !ruby/object:Gem::Version
4
- version: "0.1"
4
+ version: "0.2"
5
5
  platform: ruby
6
6
  authors:
7
7
  - Benoit Larroque
@@ -22,6 +22,16 @@ dependencies:
22
22
  - !ruby/object:Gem::Version
23
23
  version: "0.6"
24
24
  version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: htmlentities
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "4.0"
34
+ version:
25
35
  description: Html2Feedbooks is script to automate basic publishing on feedbooks.com
26
36
  email: zeta dot ben at gmail dot com
27
37
  executables: