Html2Feedbooks 1.1.1 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in html2fb.gemspec
4
+ gemspec
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/bin/html2fb.rb CHANGED
@@ -1,14 +1,11 @@
1
- #!/usr/bin/ruby
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
2
3
  require 'optparse'
3
4
  require 'open-uri'
4
- require 'conf.rb'
5
- require 'downloader.rb'
6
- require 'document.rb'
7
- require 'parser.rb'
8
- require 'feedbooks.rb'
9
5
  require 'tmpdir'
10
6
  require 'launchy'
11
7
  require 'digest/md5'
8
+ require 'lib/html2fb'
12
9
 
13
10
  include HTML2FB
14
11
 
@@ -86,7 +83,7 @@ if options[:preview]
86
83
  puts "A preview of the parsed file should be opening in your webbrowser now"
87
84
  puts "If nothing open you can open the file located at : #{page}"
88
85
  puts "When happy with the parsed output rerun with -s option to send to Feedbooks.com"
89
- Launchy::Browser.run(page)
86
+ Launchy.open(page)
90
87
  else
91
88
  doc.to_feedbooks(conf)
92
89
  end
data/html2fb.gemspec ADDED
@@ -0,0 +1,27 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "html2fb/version"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = "Html2Feedbooks"
7
+ s.version = Html2fb::VERSION
8
+ s.authors = ["Benoit Larroque"]
9
+ s.email = ["benoit dot larroque at feedbooks dot com"]
10
+ s.summary = %q{Html2Feedbooks is script to automate basic publishing on feedbooks.com}
11
+ s.homepage = %q{http://github.com/zetaben/Html2Feedbooks}
12
+ s.description = %q{Html2Feedbooks is script to automate basic publishing on feedbooks.com}
13
+
14
+ s.files = `git ls-files`.split("\n")
15
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
16
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
17
+ s.require_paths = ["lib"]
18
+ s.default_executable = 'html2fb.rb'
19
+ s.add_dependency('nokogiri','>=1.4.1')
20
+ s.add_dependency('htmlentities', '>= 4.2.1')
21
+ s.add_dependency('launchy', '>= 2.0.0')
22
+ s.add_dependency('progressbar', '>= 0.0.3')
23
+
24
+ # specify any dependencies here; for example:
25
+ # s.add_development_dependency "rspec"
26
+ # s.add_runtime_dependency "rest-client"
27
+ end
@@ -34,8 +34,8 @@ class AtomPost
34
34
  req = Net::HTTP::Get.new(url.path)
35
35
  req.basic_auth user,pass unless user.nil?
36
36
  response = http.request(req)
37
- doc=Hpricot(response.body)
38
- e=doc.at('//entry').at('link[@rel="down"]')
37
+ doc=Nokogiri::XML(response.body).remove_namespaces!
38
+ e=doc.at('//entry/link[@rel="down"]')
39
39
  return URI.parse(e[:href]).path unless e.nil?
40
40
  }
41
41
  end
@@ -47,7 +47,6 @@ class AtomPost
47
47
  #STDERR.puts "sending to #{url}"
48
48
  req = Net::HTTP::Post.new(url.path)
49
49
  req.basic_auth user,pass unless user.nil?
50
-
51
50
  req.body = '<?xml version="1.0"?>'+"\n"
52
51
  req.body +='<entry xmlns="http://www.w3.org/2005/Atom">'+"\n"
53
52
  req.body +='<title>'+decode_text(title)+'</title>'+"\n"
@@ -77,9 +76,9 @@ class AtomPost
77
76
 
78
77
  def recode_text(txt)
79
78
  return txt if txt.blank?
80
- m=Hpricot(txt)
81
- m.traverse_text{|t| t.content=force_decimal_entities(t.content) if t.content.match(/&[a-z][a-z0-9]+;/i)}
82
- m.to_html
79
+ m=Nokogiri::XML("<text>#{txt}</text>")
80
+ m.traverse{|t| next unless t.text?;t.text=force_decimal_entities(t.text) if t.text.match(/&[a-z][a-z0-9]+;/i)}
81
+ m.root.inner_html
83
82
  end
84
83
  HTMLENCODER=HTMLEntities.new
85
84
  def force_decimal_entities(txt)
@@ -88,9 +87,9 @@ class AtomPost
88
87
 
89
88
  def decode_text(txt)
90
89
  return txt if txt.blank?
91
- m=Hpricot(txt)
92
- m.traverse_text{|t| HTMLENCODER.decode(t.content)}
93
- m.to_html
90
+ m=Nokogiri::XML("<text>#{txt}</text>")
91
+ m.traverse{|t| next unless t.text?; HTMLENCODER.decode(t.text)}
92
+ m.root.inner_html
94
93
  end
95
94
 
96
95
  end
File without changes
File without changes
File without changes
@@ -1,5 +1,5 @@
1
- require 'app.rb'
2
- require 'hpricot'
1
+ require 'html2fb/app.rb'
2
+ require 'nokogiri'
3
3
  require 'digest/md5'
4
4
 
5
5
  module HTML2FB
@@ -105,11 +105,10 @@ module HTML2FB
105
105
  def to_feedbooks(conf,path=nil)
106
106
  stxt=to_html
107
107
  return unless stxt.strip.size > 0
108
- doc=Hpricot('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">'+stxt+'</div>')
109
- doc.traverse_all_element do |e|
110
- unless e.is_a?Hpricot::Text
108
+ doc=Nokogiri::XML('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">'+stxt+'</div>')
109
+ doc.traverse do |e|
110
+ if e.element?
111
111
  e.name='xhtml:'+e.name
112
- e.etag='xhtml:'+e.etag unless (!e.respond_to?:etag) || e.etag.nil?
113
112
  end
114
113
  end
115
114
  FBPost.push(conf,'',doc.to_html,"Text",path)
@@ -1,5 +1,5 @@
1
- require 'hpricot'
2
- require 'document.rb'
1
+ require 'nokogiri'
2
+ require 'html2fb/document.rb'
3
3
  require 'progressbar'
4
4
  #require 'ruby-prof'
5
5
  #require 'term/ansicolor'
@@ -14,11 +14,11 @@ module HTML2FB
14
14
 
15
15
  def parse(txt)
16
16
  puts "Parsing HTML"
17
- pdoc=Hpricot(txt)
17
+ pdoc=Nokogiri::HTML(txt)
18
18
  if @conf['conv']
19
19
  mc=pdoc/'meta[@http-equiv="Content-Type"]'
20
20
  if mc.size>0
21
- charset=mc.first.attributes['content'].split(';').find do |s|
21
+ charset=mc.first.attributes['content'].to_s.split(';').find do |s|
22
22
  s.strip[0,7]=='charset'
23
23
  end
24
24
  unless charset.nil?
@@ -28,7 +28,7 @@ module HTML2FB
28
28
  unless tc.nil?
29
29
  puts "Trying to convert source encoding from #{tc} to utf-8"
30
30
  require 'iconv'
31
- pdoc=Hpricot(Iconv.conv('utf-8',tc.downcase,txt))
31
+ pdoc=Nokogiri::HTML(Iconv.conv('utf-8',tc.downcase,txt))
32
32
 
33
33
  end
34
34
 
@@ -38,7 +38,7 @@ module HTML2FB
38
38
  puts "Removing garbage elements"
39
39
  remove_objs(pdoc)
40
40
  ti=pdoc.at('title')
41
- doc.title= ti.extract_text.strip unless ti.nil?
41
+ doc.title= ti.text.strip unless ti.nil?
42
42
  # pdoc.search('//h3').each do |e|
43
43
  # doc.content.push(e.inner_text)
44
44
  # end
@@ -58,10 +58,10 @@ module HTML2FB
58
58
  doc.search('.'+cl).remove
59
59
  end unless @conf['remove']['class'].nil?
60
60
  @conf['remove']['expr'].each do |cl|
61
- doc.search(cl).remove
61
+ doc.search(cl).remove rescue doc.xpath(cl).remove
62
62
  end unless @conf['remove']['expr'].nil?
63
63
  @conf['remove']['before'].each do |cl|
64
- x=doc.at(cl)
64
+ x=doc.at(cl) rescue doc.at_xpath(cl)
65
65
  if x
66
66
  x.preceding.remove
67
67
  x.parent.children.delete(x)
@@ -73,7 +73,7 @@ module HTML2FB
73
73
  t.remove unless t.nil?
74
74
  end unless @conf['remove']['between'].nil?
75
75
  @conf['remove']['after'].each do |cl|
76
- x=doc.at(cl)
76
+ x=doc.at(cl) rescue doc.at_xpath(cl)
77
77
  if x
78
78
  x.following.remove
79
79
  x.parent.children.delete(x)
@@ -89,13 +89,13 @@ module HTML2FB
89
89
 
90
90
  aut=build_autom(@conf['select'],ret)
91
91
 
92
- pbar = ProgressBar.new("Parsing", doc.search('//').size)
93
- doc.traverse_all_element do |el|
92
+ pbar = ProgressBar.new("Parsing", doc.search('//*').size)
93
+ doc.traverse do |el|
94
94
  aut.feed(el)
95
95
  pbar.inc
96
96
  end
97
- pbar.finish
98
97
  aut.finish(doc)
98
+ pbar.finish
99
99
  =begin
100
100
  result = RubyProf.stop
101
101
  printer = RubyProf::FlatPrinter.new(result)
@@ -180,10 +180,10 @@ module HTML2FB
180
180
  if @content=='body'
181
181
  tmp=el.preceding[0..-1]
182
182
  else
183
- tmp=el.root.search(@content...(el.xpath))[1..-1]
183
+ tmp=el.root.between(@content,(el.path),true)[1..-1]
184
184
  end
185
185
  if tmp.blank? #search can'find between siblins
186
- tmp=el.root.deep_between(@content,(el.xpath))
186
+ tmp=el.root.deep_between(@content,(el.path))
187
187
  end
188
188
  unless tmp.blank?
189
189
  tmph=tmp.to_html
@@ -195,7 +195,7 @@ module HTML2FB
195
195
  (lvl..@max_level).to_a.reverse.each do |l|
196
196
  close_section(l)
197
197
  end
198
- @starts[lvl]=create_fbsection(el.root.at(obj[:xpath]).extract_text,obj[:fblevel])
198
+ @starts[lvl]=create_fbsection(el.root.at_xpath(obj[:xpath]).text,obj[:fblevel])
199
199
  @content=obj[:xpath]
200
200
  @current_level=lvl
201
201
  end
@@ -209,7 +209,7 @@ module HTML2FB
209
209
  end
210
210
 
211
211
  def feed(el)
212
- return if el.is_a?Hpricot::Text
212
+ return if el.text?
213
213
  @done=[[]*@levels.size]
214
214
 
215
215
  @levels.each_with_index do |lvl,i|
@@ -218,7 +218,7 @@ module HTML2FB
218
218
  if el.in_search?(expr['expr'])
219
219
 
220
220
 
221
- open_section({:xpath => el.xpath, :fblevel => expr['fblevel']},i+1,el)
221
+ open_section({:xpath => el.path, :fblevel => expr['fblevel']},i+1,el)
222
222
  break
223
223
  end
224
224
  end
@@ -228,6 +228,9 @@ module HTML2FB
228
228
  end
229
229
  end
230
230
 
231
+ class Nokogiri::XML::NodeSet
232
+ alias :blank? :empty?
233
+ end
231
234
 
232
235
  class String
233
236
  def blank?
@@ -241,17 +244,22 @@ class NilClass
241
244
  end
242
245
  end
243
246
 
244
- module Hpricot::Traverse
247
+
248
+
249
+ class Nokogiri::XML::Node
250
+
245
251
  def in_search?(expr)
246
252
  if expr !~ /[^a-z0-9]/
247
253
  return self.name.downcase()==expr.downcase()
248
254
  end
249
255
 
250
- se_in=self.parent
256
+ se_in=self.root
257
+ se_in=self.parent if self.respond_to?(:parent)
251
258
  if expr[0..1]=='/'
252
259
  se_in=self.root
253
260
  end
254
- se_in.search(expr).each do |el|
261
+ set=se_in.search(expr) rescue se_in.xpath(expr)
262
+ set.each do |el|
255
263
  return true if el==self
256
264
  end
257
265
  # puts self.name+" "+expr
@@ -259,34 +267,60 @@ module Hpricot::Traverse
259
267
  end
260
268
 
261
269
  def root
262
- return @root unless @root.nil?
263
- se_in=self
264
- se_in=se_in.parent until se_in.parent.nil?
265
- @root=se_in
266
- se_in
270
+ self.document.root
267
271
  end
268
272
 
269
- def between(a,b)
270
- root.search(a..b)
273
+ def node_position
274
+ return @node_position if @node_position
275
+ @node_position=parent.children.index(self)
271
276
  end
272
277
 
273
- def extract_text
274
- t=''
275
- self.traverse_all_element do |e|
276
- t+=e.content.to_s if e.is_a?(Hpricot::Text)
278
+ def between(a,b,excl=false)
279
+
280
+ #from nokogiri
281
+ offset=(excl ? -1 : 0)
282
+ ary = []
283
+ ele1=at(a) rescue at_xpath(a)
284
+ ele2=at(b) rescue at_xpath(b)
285
+
286
+ if ele1 and ele2
287
+ # let's quickly take care of siblings
288
+ if ele1.parent == ele2.parent
289
+
290
+ ary = ele1.parent.children[ele1.node_position..(ele2.node_position+offset)]
291
+ else
292
+ # find common parent
293
+ ele1_p=ele1.ancestors
294
+ ele2_p=ele2.ancestors
295
+ common_parent = ele1_p.zip(ele2_p).select { |p1, p2| p1 == p2 }.flatten.first
296
+
297
+ child = nil
298
+ if ele1 == common_parent
299
+ child = ele2
300
+ elsif ele2 == common_parent
301
+ child = ele1
302
+ end
303
+
304
+ if child
305
+ ary = common_parent.children[0..(child.node_position+offset)]
306
+ end
307
+ end
277
308
  end
278
- t
309
+
310
+ return Nokogiri::XML::NodeSet.new(ele1.document,ary)
279
311
  end
280
- def deep_between(i,j)
281
312
 
282
- unless j.nil? || self.at(j).nil?
283
- tm=self.at(i)
313
+
314
+
315
+ def deep_between(i,j)
316
+ unless j.nil? || self.at_xpath(j).nil?
317
+ tm=self.at_xpath(i)
284
318
  prec=tm.deep_preceding
285
- r=Hpricot::Elements[*self.at(j).deep_preceding.find_all{|el| !(prec.include?el || el==tm)}]
319
+ r=Nokogiri::XML::NodeSet.new(tm.document,[*self.at(j).deep_preceding.find_all{|el| !(prec.include?el || el==tm)}])
286
320
  else
287
321
  r=self.at(i).deep_following unless self.at(i).nil?
288
322
  end
289
- Hpricot::Elements[*select_end(r,i)]
323
+ Nokogiri::XML::NodeSet.new(self.document,[*select_end(r,i)])
290
324
  end
291
325
 
292
326
  def select_end(tab,expr)
@@ -296,13 +330,15 @@ module Hpricot::Traverse
296
330
  idx=-1
297
331
  i=0
298
332
  tab.each do |e|
299
- if e.search(expr.gsub(e.xpath,'.')).size > 0
333
+ nxp=expr.gsub(e.path,'.')
334
+ set=e.search(nxp) rescue e.xpath(nxp)
335
+ if set.size > 0
300
336
  idx=i
301
337
  #if e.search(i).size > 0
302
- if e.children.find{|ee| ee.xpath==expr }
338
+ if e.children.find{|ee| ee.path==expr }
303
339
  e.children.each do |ee|
304
340
  s << ee if f
305
- f=true if ee.xpath==expr
341
+ f=true if ee.path==expr
306
342
  end
307
343
  else
308
344
  s=select_end(e.children,expr)
@@ -316,20 +352,24 @@ module Hpricot::Traverse
316
352
  return s+tab[(idx+1)..-1]
317
353
  end
318
354
 
355
+ def preceding
356
+ self.parent.children[0...node_position]
357
+ end
358
+
359
+ def following
360
+ self.parent.children[node_position+1..-1]
361
+ end
362
+
319
363
  def deep_preceding()
320
- ret=Hpricot::Elements[]
321
- ret+=parent.deep_preceding if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
364
+ ret=Nokogiri::XML::NodeSet.new(self.document,[])
365
+ ret+=parent.deep_preceding if respond_to?(:parent) && !parent.is_a?(Nokogiri::XML::Document)
322
366
  ret+=preceding
323
- Hpricot::Elements[*ret]
367
+ ret
324
368
  end
325
369
  def deep_following()
326
370
  ret=following
327
- ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Hpricot::Doc )
328
- Hpricot::Elements[*ret]
371
+ ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Nokogiri::XML::Document)
372
+ ret
329
373
  end
330
374
 
331
375
  end
332
-
333
- class Hpricot::Elements
334
- alias_method :blank?, :empty?
335
- end
@@ -0,0 +1,3 @@
1
+ module Html2fb
2
+ VERSION = "1.3.1"
3
+ end
data/lib/html2fb.rb ADDED
@@ -0,0 +1,7 @@
1
+ require "html2fb/version"
2
+
3
+ require 'html2fb/conf.rb'
4
+ require 'html2fb/downloader.rb'
5
+ require 'html2fb/document.rb'
6
+ require 'html2fb/parser.rb'
7
+ require 'html2fb/feedbooks.rb'