zetaben-Html2Feedbooks 0.1 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
data/README CHANGED
@@ -8,4 +8,6 @@ Usage
8
8
 
9
9
  ./html2fb URL.html
10
10
 
11
- You can change settings in confs/conf.yaml
11
+ You can change some settings by creating your own configuration file and using
12
+
13
+ html2fb -c myconf.yaml URL.html
data/bin/html2fb.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/ruby
2
+ require 'optparse'
2
3
  require 'open-uri'
3
4
  require 'conf.rb'
4
5
  require 'downloader.rb'
@@ -8,6 +9,20 @@ require 'feedbooks.rb'
8
9
 
9
10
  include HTML2FB
10
11
 
12
+ options = {}
13
+ options[:conf] = "conf.yaml"
14
+ options[:preview] = true
15
+ OptionParser.new do |opts|
16
+ opts.banner = "Usage: html2fb [options] URL"
17
+
18
+ opts.on("-c", "--conf FILE", String,"Configuration file") do |f|
19
+ options[:conf] = f
20
+ end
21
+ opts.on("-s", "-s","Send to feedbooks") do |f|
22
+ options[:preview] = !f
23
+ end
24
+ end.parse!
25
+
11
26
  valid=false
12
27
  entry=ARGV[0]
13
28
  while !valid
@@ -23,9 +38,16 @@ while !valid
23
38
  print "URL : " if entry.nil? || entry==''
24
39
  entry=STDIN.readline.strip unless valid
25
40
  end
26
- conf=Conf.new('conf.yaml')
41
+ conf=Conf.new(options[:conf])
27
42
  content=Downloader.download(url)
28
43
  #puts content.size
29
44
  doc=Parser.new(conf).parse(content)
30
45
  puts doc.toc.to_yaml
46
+ if options[:preview]
47
+ f=File.open('/tmp/plop.html','w')
48
+ f.write doc.to_html
49
+ f.close
50
+ `firefox /tmp/plop.html`
51
+ else
31
52
  doc.to_feedbooks(conf)
53
+ end
data/lib/app.rb CHANGED
@@ -2,6 +2,7 @@ require 'digest/md5'
2
2
  require 'open-uri'
3
3
  require 'net/http'
4
4
  require 'time'
5
+ require 'htmlentities'
5
6
 
6
7
  class AtomPost
7
8
  attr_accessor :title
@@ -25,11 +26,11 @@ class AtomPost
25
26
 
26
27
  req.body = '<?xml version="1.0"?>'+"\n"
27
28
  req.body +='<entry xmlns="http://www.w3.org/2005/Atom">'+"\n"
28
- req.body +='<title>'+title+'</title>'+"\n"
29
+ req.body +='<title>'+recode_text(title)+'</title>'+"\n"
29
30
  req.body +='<id>'+Digest::MD5.hexdigest(title+content)+'</id>'+"\n"
30
31
  req.body +='<updated>'+date.xmlschema+'</updated>'+"\n"
31
32
  req.body +='<author><name>'+author+'</name></author>'+"\n"
32
- req.body +='<content>'+content+'</content>'+"\n"
33
+ req.body +='<content>'+recode_text(content)+'</content>'+"\n"
33
34
  req.body +='</entry>'+"\n"
34
35
 
35
36
  req.set_content_type('application/atom+xml;type=entry')
@@ -46,4 +47,15 @@ class AtomPost
46
47
  res.error!
47
48
  end
48
49
  end
50
+
51
+ def recode_text(txt)
52
+ return txt if txt.blank?
53
+ m=Hpricot(txt)
54
+ m.traverse_text{|t| t.content=force_decimal_entities(t.content) if t.content.match(/&[a-z][a-z0-9]+;/i)}
55
+ m.to_html
56
+ end
57
+ HTMLENCODER=HTMLEntities.new
58
+ def force_decimal_entities(txt)
59
+ HTMLENCODER.encode(HTMLENCODER.decode(txt),:decimal)
60
+ end
49
61
  end
data/lib/conf.rb CHANGED
@@ -2,7 +2,7 @@
2
2
  module HTML2FB
3
3
  class Conf
4
4
  def initialize(file)
5
- ['./',"#{File.dirname(__FILE__)}/","#{File.dirname(__FILE__)}/../confs/"].each do |p|
5
+ ['','./',"#{File.dirname(__FILE__)}/","#{File.dirname(__FILE__)}/../confs/"].each do |p|
6
6
  f=p+file
7
7
  begin
8
8
  if File.readable?(f) && File.exists?(f)
data/lib/feedbooks.rb CHANGED
@@ -54,6 +54,7 @@ module HTML2FB
54
54
  class Section
55
55
  @@level=0
56
56
  def to_feedbooks(conf)
57
+ puts "Sending to feedbooks"
57
58
  fb=FBSession.session
58
59
  post=AtomPost.new "http://#{fb.host}/#{fb.booktype}/#{fb.bookid}/contents.atom"
59
60
  doc=Hpricot('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">'+to_html+'</div>')
data/lib/parser.rb CHANGED
@@ -8,16 +8,27 @@ module HTML2FB
8
8
  @conf=conf
9
9
  end
10
10
 
11
+ def extract_text(n)
12
+ t=''
13
+ n.traverse_all_element do |e|
14
+ t+=e.content.to_s if e.is_a?(Hpricot::Text)
15
+ end
16
+ t
17
+ end
18
+
11
19
  def parse(txt)
20
+ puts "Parsing HTML"
12
21
  pdoc=Hpricot(txt)
13
22
  doc=Document.new
23
+ puts "Removing garbage elements"
14
24
  remove_objs(pdoc)
15
25
  ti=pdoc.at('title')
16
- doc.title= ti.inner_text.strip unless ti.nil?
26
+ doc.title= extract_text(ti).strip unless ti.nil?
17
27
  # pdoc.search('//h3').each do |e|
18
28
  # doc.content.push(e.inner_text)
19
29
  # end
20
30
 
31
+ puts "Building TOC"
21
32
  parse_text(pdoc,doc)
22
33
 
23
34
  return doc
@@ -34,8 +45,10 @@ module HTML2FB
34
45
  end unless @conf['remove']['expr'].nil?
35
46
  @conf['remove']['before'].each do |cl|
36
47
  x=doc.at(cl)
48
+ if x
37
49
  x.preceding.remove
38
50
  x.parent.children.delete(x)
51
+ end
39
52
  end unless @conf['remove']['before'].nil?
40
53
  @conf['remove']['between'].each do |cl|
41
54
  # puts "between "+cl.inspect
@@ -43,11 +56,13 @@ module HTML2FB
43
56
  end unless @conf['remove']['between'].nil?
44
57
  @conf['remove']['after'].each do |cl|
45
58
  x=doc.at(cl)
59
+ if x
46
60
  x.following.remove
47
61
  x.parent.children.delete(x)
62
+ end
48
63
  end unless @conf['remove']['after'].nil?
49
64
  end
50
- File.open('/tmp/test.html','w'){|f| f.write doc.to_html}
65
+ # File.open('/tmp/test.html','w'){|f| f.write doc.to_html}
51
66
  end
52
67
 
53
68
  def parse_text(doc,ret)
@@ -59,7 +74,8 @@ module HTML2FB
59
74
  tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).to_html
60
75
  tmp.sub!(a.first.to_original_html,'')
61
76
  s.content =[Text.new(tmp)]
62
- s.title = a.first.inner_text.to_s
77
+ #buggy with entities
78
+ s.title = extract_text(a.first)
63
79
  ret.content.push s
64
80
 
65
81
  end
@@ -88,7 +104,7 @@ module HTML2FB
88
104
  s=Section.new
89
105
  tmp=doc.between(a.first.xpath,a.last.nil? ? nil : a.last.xpath).to_html
90
106
  s.content = [Text.new(tmp)]
91
- s.title = a.first.inner_text.to_s
107
+ s.title = extract_text(a.first)
92
108
  el.content.push s
93
109
  l.content.sub!(tmp,'')
94
110
  l.content.sub!(a.first.to_original_html,'')
@@ -117,13 +133,25 @@ end
117
133
  module Hpricot::Traverse
118
134
  def between(i,j)
119
135
  #puts i,j
120
- unless j.nil?
121
- prec=self.at(i).preceding
122
- Hpricot::Elements[*self.at(j).preceding.find_all{|el| !prec.include?el}]
136
+ unless j.nil? || self.at(j).nil?
137
+ prec=self.at(i).deep_preceding
138
+ Hpricot::Elements[*self.at(j).deep_preceding.find_all{|el| !prec.include?el}]
123
139
  else
124
- self.at(i).following
140
+ self.at(i).deep_following unless self.at(i).nil?
125
141
  end
126
142
  end
143
+
144
+ def deep_preceding()
145
+ ret=Hpricot::Elements[]
146
+ ret+=parent.deep_preceding if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
147
+ ret+=preceding
148
+ Hpricot::Elements[*ret]
149
+ end
150
+ def deep_following()
151
+ ret=following
152
+ ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
153
+ Hpricot::Elements[*ret]
154
+ end
127
155
  end
128
156
 
129
157
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: zetaben-Html2Feedbooks
3
3
  version: !ruby/object:Gem::Version
4
- version: "0.1"
4
+ version: "0.2"
5
5
  platform: ruby
6
6
  authors:
7
7
  - Benoit Larroque
@@ -22,6 +22,16 @@ dependencies:
22
22
  - !ruby/object:Gem::Version
23
23
  version: "0.6"
24
24
  version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: htmlentities
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "4.0"
34
+ version:
25
35
  description: Html2Feedbooks is script to automate basic publishing on feedbooks.com
26
36
  email: zeta dot ben at gmail dot com
27
37
  executables: