Html2Feedbooks 1.1.1 → 1.3.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in html2fb.gemspec
4
+ gemspec
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/bin/html2fb.rb CHANGED
@@ -1,14 +1,11 @@
1
- #!/usr/bin/ruby
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
2
3
  require 'optparse'
3
4
  require 'open-uri'
4
- require 'conf.rb'
5
- require 'downloader.rb'
6
- require 'document.rb'
7
- require 'parser.rb'
8
- require 'feedbooks.rb'
9
5
  require 'tmpdir'
10
6
  require 'launchy'
11
7
  require 'digest/md5'
8
+ require 'lib/html2fb'
12
9
 
13
10
  include HTML2FB
14
11
 
@@ -86,7 +83,7 @@ if options[:preview]
86
83
  puts "A preview of the parsed file should be opening in your webbrowser now"
87
84
  puts "If nothing open you can open the file located at : #{page}"
88
85
  puts "When happy with the parsed output rerun with -s option to send to Feedbooks.com"
89
- Launchy::Browser.run(page)
86
+ Launchy.open(page)
90
87
  else
91
88
  doc.to_feedbooks(conf)
92
89
  end
data/html2fb.gemspec ADDED
@@ -0,0 +1,27 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "html2fb/version"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = "Html2Feedbooks"
7
+ s.version = Html2fb::VERSION
8
+ s.authors = ["Benoit Larroque"]
9
+ s.email = ["benoit dot larroque at feedbooks dot com"]
10
+ s.summary = %q{Html2Feedbooks is script to automate basic publishing on feedbooks.com}
11
+ s.homepage = %q{http://github.com/zetaben/Html2Feedbooks}
12
+ s.description = %q{Html2Feedbooks is script to automate basic publishing on feedbooks.com}
13
+
14
+ s.files = `git ls-files`.split("\n")
15
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
16
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
17
+ s.require_paths = ["lib"]
18
+ s.default_executable = 'html2fb.rb'
19
+ s.add_dependency('nokogiri','>=1.4.1')
20
+ s.add_dependency('htmlentities', '>= 4.2.1')
21
+ s.add_dependency('launchy', '>= 2.0.0')
22
+ s.add_dependency('progressbar', '>= 0.0.3')
23
+
24
+ # specify any dependencies here; for example:
25
+ # s.add_development_dependency "rspec"
26
+ # s.add_runtime_dependency "rest-client"
27
+ end
@@ -34,8 +34,8 @@ class AtomPost
34
34
  req = Net::HTTP::Get.new(url.path)
35
35
  req.basic_auth user,pass unless user.nil?
36
36
  response = http.request(req)
37
- doc=Hpricot(response.body)
38
- e=doc.at('//entry').at('link[@rel="down"]')
37
+ doc=Nokogiri::XML(response.body).remove_namespaces!
38
+ e=doc.at('//entry/link[@rel="down"]')
39
39
  return URI.parse(e[:href]).path unless e.nil?
40
40
  }
41
41
  end
@@ -47,7 +47,6 @@ class AtomPost
47
47
  #STDERR.puts "sending to #{url}"
48
48
  req = Net::HTTP::Post.new(url.path)
49
49
  req.basic_auth user,pass unless user.nil?
50
-
51
50
  req.body = '<?xml version="1.0"?>'+"\n"
52
51
  req.body +='<entry xmlns="http://www.w3.org/2005/Atom">'+"\n"
53
52
  req.body +='<title>'+decode_text(title)+'</title>'+"\n"
@@ -77,9 +76,9 @@ class AtomPost
77
76
 
78
77
  def recode_text(txt)
79
78
  return txt if txt.blank?
80
- m=Hpricot(txt)
81
- m.traverse_text{|t| t.content=force_decimal_entities(t.content) if t.content.match(/&[a-z][a-z0-9]+;/i)}
82
- m.to_html
79
+ m=Nokogiri::XML("<text>#{txt}</text>")
80
+ m.traverse{|t| next unless t.text?;t.text=force_decimal_entities(t.text) if t.text.match(/&[a-z][a-z0-9]+;/i)}
81
+ m.root.inner_html
83
82
  end
84
83
  HTMLENCODER=HTMLEntities.new
85
84
  def force_decimal_entities(txt)
@@ -88,9 +87,9 @@ class AtomPost
88
87
 
89
88
  def decode_text(txt)
90
89
  return txt if txt.blank?
91
- m=Hpricot(txt)
92
- m.traverse_text{|t| HTMLENCODER.decode(t.content)}
93
- m.to_html
90
+ m=Nokogiri::XML("<text>#{txt}</text>")
91
+ m.traverse{|t| next unless t.text?; HTMLENCODER.decode(t.text)}
92
+ m.root.inner_html
94
93
  end
95
94
 
96
95
  end
File without changes
File without changes
File without changes
@@ -1,5 +1,5 @@
1
- require 'app.rb'
2
- require 'hpricot'
1
+ require 'html2fb/app.rb'
2
+ require 'nokogiri'
3
3
  require 'digest/md5'
4
4
 
5
5
  module HTML2FB
@@ -105,11 +105,10 @@ module HTML2FB
105
105
  def to_feedbooks(conf,path=nil)
106
106
  stxt=to_html
107
107
  return unless stxt.strip.size > 0
108
- doc=Hpricot('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">'+stxt+'</div>')
109
- doc.traverse_all_element do |e|
110
- unless e.is_a?Hpricot::Text
108
+ doc=Nokogiri::XML('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">'+stxt+'</div>')
109
+ doc.traverse do |e|
110
+ if e.element?
111
111
  e.name='xhtml:'+e.name
112
- e.etag='xhtml:'+e.etag unless (!e.respond_to?:etag) || e.etag.nil?
113
112
  end
114
113
  end
115
114
  FBPost.push(conf,'',doc.to_html,"Text",path)
@@ -1,5 +1,5 @@
1
- require 'hpricot'
2
- require 'document.rb'
1
+ require 'nokogiri'
2
+ require 'html2fb/document.rb'
3
3
  require 'progressbar'
4
4
  #require 'ruby-prof'
5
5
  #require 'term/ansicolor'
@@ -14,11 +14,11 @@ module HTML2FB
14
14
 
15
15
  def parse(txt)
16
16
  puts "Parsing HTML"
17
- pdoc=Hpricot(txt)
17
+ pdoc=Nokogiri::HTML(txt)
18
18
  if @conf['conv']
19
19
  mc=pdoc/'meta[@http-equiv="Content-Type"]'
20
20
  if mc.size>0
21
- charset=mc.first.attributes['content'].split(';').find do |s|
21
+ charset=mc.first.attributes['content'].to_s.split(';').find do |s|
22
22
  s.strip[0,7]=='charset'
23
23
  end
24
24
  unless charset.nil?
@@ -28,7 +28,7 @@ module HTML2FB
28
28
  unless tc.nil?
29
29
  puts "Trying to convert source encoding from #{tc} to utf-8"
30
30
  require 'iconv'
31
- pdoc=Hpricot(Iconv.conv('utf-8',tc.downcase,txt))
31
+ pdoc=Nokogiri::HTML(Iconv.conv('utf-8',tc.downcase,txt))
32
32
 
33
33
  end
34
34
 
@@ -38,7 +38,7 @@ module HTML2FB
38
38
  puts "Removing garbage elements"
39
39
  remove_objs(pdoc)
40
40
  ti=pdoc.at('title')
41
- doc.title= ti.extract_text.strip unless ti.nil?
41
+ doc.title= ti.text.strip unless ti.nil?
42
42
  # pdoc.search('//h3').each do |e|
43
43
  # doc.content.push(e.inner_text)
44
44
  # end
@@ -58,10 +58,10 @@ module HTML2FB
58
58
  doc.search('.'+cl).remove
59
59
  end unless @conf['remove']['class'].nil?
60
60
  @conf['remove']['expr'].each do |cl|
61
- doc.search(cl).remove
61
+ doc.search(cl).remove rescue doc.xpath(cl).remove
62
62
  end unless @conf['remove']['expr'].nil?
63
63
  @conf['remove']['before'].each do |cl|
64
- x=doc.at(cl)
64
+ x=doc.at(cl) rescue doc.at_xpath(cl)
65
65
  if x
66
66
  x.preceding.remove
67
67
  x.parent.children.delete(x)
@@ -73,7 +73,7 @@ module HTML2FB
73
73
  t.remove unless t.nil?
74
74
  end unless @conf['remove']['between'].nil?
75
75
  @conf['remove']['after'].each do |cl|
76
- x=doc.at(cl)
76
+ x=doc.at(cl) rescue doc.at_xpath(cl)
77
77
  if x
78
78
  x.following.remove
79
79
  x.parent.children.delete(x)
@@ -89,13 +89,13 @@ module HTML2FB
89
89
 
90
90
  aut=build_autom(@conf['select'],ret)
91
91
 
92
- pbar = ProgressBar.new("Parsing", doc.search('//').size)
93
- doc.traverse_all_element do |el|
92
+ pbar = ProgressBar.new("Parsing", doc.search('//*').size)
93
+ doc.traverse do |el|
94
94
  aut.feed(el)
95
95
  pbar.inc
96
96
  end
97
- pbar.finish
98
97
  aut.finish(doc)
98
+ pbar.finish
99
99
  =begin
100
100
  result = RubyProf.stop
101
101
  printer = RubyProf::FlatPrinter.new(result)
@@ -180,10 +180,10 @@ module HTML2FB
180
180
  if @content=='body'
181
181
  tmp=el.preceding[0..-1]
182
182
  else
183
- tmp=el.root.search(@content...(el.xpath))[1..-1]
183
+ tmp=el.root.between(@content,(el.path),true)[1..-1]
184
184
  end
185
185
  if tmp.blank? #search can'find between siblins
186
- tmp=el.root.deep_between(@content,(el.xpath))
186
+ tmp=el.root.deep_between(@content,(el.path))
187
187
  end
188
188
  unless tmp.blank?
189
189
  tmph=tmp.to_html
@@ -195,7 +195,7 @@ module HTML2FB
195
195
  (lvl..@max_level).to_a.reverse.each do |l|
196
196
  close_section(l)
197
197
  end
198
- @starts[lvl]=create_fbsection(el.root.at(obj[:xpath]).extract_text,obj[:fblevel])
198
+ @starts[lvl]=create_fbsection(el.root.at_xpath(obj[:xpath]).text,obj[:fblevel])
199
199
  @content=obj[:xpath]
200
200
  @current_level=lvl
201
201
  end
@@ -209,7 +209,7 @@ module HTML2FB
209
209
  end
210
210
 
211
211
  def feed(el)
212
- return if el.is_a?Hpricot::Text
212
+ return if el.text?
213
213
  @done=[[]*@levels.size]
214
214
 
215
215
  @levels.each_with_index do |lvl,i|
@@ -218,7 +218,7 @@ module HTML2FB
218
218
  if el.in_search?(expr['expr'])
219
219
 
220
220
 
221
- open_section({:xpath => el.xpath, :fblevel => expr['fblevel']},i+1,el)
221
+ open_section({:xpath => el.path, :fblevel => expr['fblevel']},i+1,el)
222
222
  break
223
223
  end
224
224
  end
@@ -228,6 +228,9 @@ module HTML2FB
228
228
  end
229
229
  end
230
230
 
231
+ class Nokogiri::XML::NodeSet
232
+ alias :blank? :empty?
233
+ end
231
234
 
232
235
  class String
233
236
  def blank?
@@ -241,17 +244,22 @@ class NilClass
241
244
  end
242
245
  end
243
246
 
244
- module Hpricot::Traverse
247
+
248
+
249
+ class Nokogiri::XML::Node
250
+
245
251
  def in_search?(expr)
246
252
  if expr !~ /[^a-z0-9]/
247
253
  return self.name.downcase()==expr.downcase()
248
254
  end
249
255
 
250
- se_in=self.parent
256
+ se_in=self.root
257
+ se_in=self.parent if self.respond_to?(:parent)
251
258
  if expr[0..1]=='/'
252
259
  se_in=self.root
253
260
  end
254
- se_in.search(expr).each do |el|
261
+ set=se_in.search(expr) rescue se_in.xpath(expr)
262
+ set.each do |el|
255
263
  return true if el==self
256
264
  end
257
265
  # puts self.name+" "+expr
@@ -259,34 +267,60 @@ module Hpricot::Traverse
259
267
  end
260
268
 
261
269
  def root
262
- return @root unless @root.nil?
263
- se_in=self
264
- se_in=se_in.parent until se_in.parent.nil?
265
- @root=se_in
266
- se_in
270
+ self.document.root
267
271
  end
268
272
 
269
- def between(a,b)
270
- root.search(a..b)
273
+ def node_position
274
+ return @node_position if @node_position
275
+ @node_position=parent.children.index(self)
271
276
  end
272
277
 
273
- def extract_text
274
- t=''
275
- self.traverse_all_element do |e|
276
- t+=e.content.to_s if e.is_a?(Hpricot::Text)
278
+ def between(a,b,excl=false)
279
+
280
+ #from nokogiri
281
+ offset=(excl ? -1 : 0)
282
+ ary = []
283
+ ele1=at(a) rescue at_xpath(a)
284
+ ele2=at(b) rescue at_xpath(b)
285
+
286
+ if ele1 and ele2
287
+ # let's quickly take care of siblings
288
+ if ele1.parent == ele2.parent
289
+
290
+ ary = ele1.parent.children[ele1.node_position..(ele2.node_position+offset)]
291
+ else
292
+ # find common parent
293
+ ele1_p=ele1.ancestors
294
+ ele2_p=ele2.ancestors
295
+ common_parent = ele1_p.zip(ele2_p).select { |p1, p2| p1 == p2 }.flatten.first
296
+
297
+ child = nil
298
+ if ele1 == common_parent
299
+ child = ele2
300
+ elsif ele2 == common_parent
301
+ child = ele1
302
+ end
303
+
304
+ if child
305
+ ary = common_parent.children[0..(child.node_position+offset)]
306
+ end
307
+ end
277
308
  end
278
- t
309
+
310
+ return Nokogiri::XML::NodeSet.new(ele1.document,ary)
279
311
  end
280
- def deep_between(i,j)
281
312
 
282
- unless j.nil? || self.at(j).nil?
283
- tm=self.at(i)
313
+
314
+
315
+ def deep_between(i,j)
316
+ unless j.nil? || self.at_xpath(j).nil?
317
+ tm=self.at_xpath(i)
284
318
  prec=tm.deep_preceding
285
- r=Hpricot::Elements[*self.at(j).deep_preceding.find_all{|el| !(prec.include?el || el==tm)}]
319
+ r=Nokogiri::XML::NodeSet.new(tm.document,[*self.at(j).deep_preceding.find_all{|el| !(prec.include?el || el==tm)}])
286
320
  else
287
321
  r=self.at(i).deep_following unless self.at(i).nil?
288
322
  end
289
- Hpricot::Elements[*select_end(r,i)]
323
+ Nokogiri::XML::NodeSet.new(self.document,[*select_end(r,i)])
290
324
  end
291
325
 
292
326
  def select_end(tab,expr)
@@ -296,13 +330,15 @@ module Hpricot::Traverse
296
330
  idx=-1
297
331
  i=0
298
332
  tab.each do |e|
299
- if e.search(expr.gsub(e.xpath,'.')).size > 0
333
+ nxp=expr.gsub(e.path,'.')
334
+ set=e.search(nxp) rescue e.xpath(nxp)
335
+ if set.size > 0
300
336
  idx=i
301
337
  #if e.search(i).size > 0
302
- if e.children.find{|ee| ee.xpath==expr }
338
+ if e.children.find{|ee| ee.path==expr }
303
339
  e.children.each do |ee|
304
340
  s << ee if f
305
- f=true if ee.xpath==expr
341
+ f=true if ee.path==expr
306
342
  end
307
343
  else
308
344
  s=select_end(e.children,expr)
@@ -316,20 +352,24 @@ module Hpricot::Traverse
316
352
  return s+tab[(idx+1)..-1]
317
353
  end
318
354
 
355
+ def preceding
356
+ self.parent.children[0...node_position]
357
+ end
358
+
359
+ def following
360
+ self.parent.children[node_position+1..-1]
361
+ end
362
+
319
363
  def deep_preceding()
320
- ret=Hpricot::Elements[]
321
- ret+=parent.deep_preceding if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
364
+ ret=Nokogiri::XML::NodeSet.new(self.document,[])
365
+ ret+=parent.deep_preceding if respond_to?(:parent) && !parent.is_a?(Nokogiri::XML::Document)
322
366
  ret+=preceding
323
- Hpricot::Elements[*ret]
367
+ ret
324
368
  end
325
369
  def deep_following()
326
370
  ret=following
327
- ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
328
- Hpricot::Elements[*ret]
371
+ ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Nokogiri::XML::Document)
372
+ ret
329
373
  end
330
374
 
331
375
  end
332
-
333
- class Hpricot::Elements
334
- alias_method :blank?, :empty?
335
- end
@@ -0,0 +1,3 @@
1
+ module Html2fb
2
+ VERSION = "1.3.1"
3
+ end
data/lib/html2fb.rb ADDED
@@ -0,0 +1,7 @@
1
+ require "html2fb/version"
2
+
3
+ require 'html2fb/conf.rb'
4
+ require 'html2fb/downloader.rb'
5
+ require 'html2fb/document.rb'
6
+ require 'html2fb/parser.rb'
7
+ require 'html2fb/feedbooks.rb'