Html2Feedbooks 1.1.1 → 1.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/Rakefile +1 -0
- data/bin/html2fb.rb +4 -7
- data/html2fb.gemspec +27 -0
- data/lib/{app.rb → html2fb/app.rb} +8 -9
- data/lib/{conf.rb → html2fb/conf.rb} +0 -0
- data/lib/{document.rb → html2fb/document.rb} +0 -0
- data/lib/{downloader.rb → html2fb/downloader.rb} +0 -0
- data/lib/{feedbooks.rb → html2fb/feedbooks.rb} +5 -6
- data/lib/{parser.rb → html2fb/parser.rb} +89 -49
- data/lib/html2fb/version.rb +3 -0
- data/lib/html2fb.rb +7 -0
- data/samples/107-h.htm +19642 -0
- data/samples/107-h2.htm +1259 -0
- data/samples/3049-h.htm +7787 -0
- data/samples/3058-h.htm +8732 -0
- data/samples/3258-h.htm +19894 -0
- data/samples/3258-h2.htm +686 -0
- data/samples/3469-h.htm +14024 -0
- data/samples/conf107-h.yml +27 -0
- data/samples/conf3049-h.yml +27 -0
- data/samples/conf3058-h.yml +27 -0
- data/samples/conf3258-h.yml +26 -0
- data/samples/conf3469-h.yml +32 -0
- metadata +41 -21
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/bin/html2fb.rb
CHANGED
@@ -1,14 +1,11 @@
|
|
1
|
-
#!/usr/bin/ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
2
3
|
require 'optparse'
|
3
4
|
require 'open-uri'
|
4
|
-
require 'conf.rb'
|
5
|
-
require 'downloader.rb'
|
6
|
-
require 'document.rb'
|
7
|
-
require 'parser.rb'
|
8
|
-
require 'feedbooks.rb'
|
9
5
|
require 'tmpdir'
|
10
6
|
require 'launchy'
|
11
7
|
require 'digest/md5'
|
8
|
+
require 'lib/html2fb'
|
12
9
|
|
13
10
|
include HTML2FB
|
14
11
|
|
@@ -86,7 +83,7 @@ if options[:preview]
|
|
86
83
|
puts "A preview of the parsed file should be opening in your webbrowser now"
|
87
84
|
puts "If nothing open you can open the file located at : #{page}"
|
88
85
|
puts "When happy with the parsed output rerun with -s option to send to Feedbooks.com"
|
89
|
-
Launchy
|
86
|
+
Launchy.open(page)
|
90
87
|
else
|
91
88
|
doc.to_feedbooks(conf)
|
92
89
|
end
|
data/html2fb.gemspec
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "html2fb/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |s|
|
6
|
+
s.name = "Html2Feedbooks"
|
7
|
+
s.version = Html2fb::VERSION
|
8
|
+
s.authors = ["Benoit Larroque"]
|
9
|
+
s.email = ["benoit dot larroque at feedbooks dot com"]
|
10
|
+
s.summary = %q{Html2Feedbooks is script to automate basic publishing on feedbooks.com}
|
11
|
+
s.homepage = %q{http://github.com/zetaben/Html2Feedbooks}
|
12
|
+
s.description = %q{Html2Feedbooks is script to automate basic publishing on feedbooks.com}
|
13
|
+
|
14
|
+
s.files = `git ls-files`.split("\n")
|
15
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
16
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
17
|
+
s.require_paths = ["lib"]
|
18
|
+
s.default_executable = 'html2fb.rb'
|
19
|
+
s.add_dependency('nokogiri','>=1.4.1')
|
20
|
+
s.add_dependency('htmlentities', '>= 4.2.1')
|
21
|
+
s.add_dependency('launchy', '>= 2.0.0')
|
22
|
+
s.add_dependency('progressbar', '>= 0.0.3')
|
23
|
+
|
24
|
+
# specify any dependencies here; for example:
|
25
|
+
# s.add_development_dependency "rspec"
|
26
|
+
# s.add_runtime_dependency "rest-client"
|
27
|
+
end
|
@@ -34,8 +34,8 @@ class AtomPost
|
|
34
34
|
req = Net::HTTP::Get.new(url.path)
|
35
35
|
req.basic_auth user,pass unless user.nil?
|
36
36
|
response = http.request(req)
|
37
|
-
doc=
|
38
|
-
e=doc.at('//entry
|
37
|
+
doc=Nokogiri::XML(response.body).remove_namespaces!
|
38
|
+
e=doc.at('//entry/link[@rel="down"]')
|
39
39
|
return URI.parse(e[:href]).path unless e.nil?
|
40
40
|
}
|
41
41
|
end
|
@@ -47,7 +47,6 @@ class AtomPost
|
|
47
47
|
#STDERR.puts "sending to #{url}"
|
48
48
|
req = Net::HTTP::Post.new(url.path)
|
49
49
|
req.basic_auth user,pass unless user.nil?
|
50
|
-
|
51
50
|
req.body = '<?xml version="1.0"?>'+"\n"
|
52
51
|
req.body +='<entry xmlns="http://www.w3.org/2005/Atom">'+"\n"
|
53
52
|
req.body +='<title>'+decode_text(title)+'</title>'+"\n"
|
@@ -77,9 +76,9 @@ class AtomPost
|
|
77
76
|
|
78
77
|
def recode_text(txt)
|
79
78
|
return txt if txt.blank?
|
80
|
-
m=
|
81
|
-
m.
|
82
|
-
m.
|
79
|
+
m=Nokogiri::XML("<text>#{txt}</text>")
|
80
|
+
m.traverse{|t| next unless t.text?;t.text=force_decimal_entities(t.text) if t.text.match(/&[a-z][a-z0-9]+;/i)}
|
81
|
+
m.root.inner_html
|
83
82
|
end
|
84
83
|
HTMLENCODER=HTMLEntities.new
|
85
84
|
def force_decimal_entities(txt)
|
@@ -88,9 +87,9 @@ class AtomPost
|
|
88
87
|
|
89
88
|
def decode_text(txt)
|
90
89
|
return txt if txt.blank?
|
91
|
-
m=
|
92
|
-
m.
|
93
|
-
m.
|
90
|
+
m=Nokogiri::XML("<text>#{txt}</text>")
|
91
|
+
m.traverse{|t| next unless t.text?; HTMLENCODER.decode(t.text)}
|
92
|
+
m.root.inner_html
|
94
93
|
end
|
95
94
|
|
96
95
|
end
|
File without changes
|
File without changes
|
File without changes
|
@@ -1,5 +1,5 @@
|
|
1
|
-
require 'app.rb'
|
2
|
-
require '
|
1
|
+
require 'html2fb/app.rb'
|
2
|
+
require 'nokogiri'
|
3
3
|
require 'digest/md5'
|
4
4
|
|
5
5
|
module HTML2FB
|
@@ -105,11 +105,10 @@ module HTML2FB
|
|
105
105
|
def to_feedbooks(conf,path=nil)
|
106
106
|
stxt=to_html
|
107
107
|
return unless stxt.strip.size > 0
|
108
|
-
doc=
|
109
|
-
doc.
|
110
|
-
|
108
|
+
doc=Nokogiri::XML('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">'+stxt+'</div>')
|
109
|
+
doc.traverse do |e|
|
110
|
+
if e.element?
|
111
111
|
e.name='xhtml:'+e.name
|
112
|
-
e.etag='xhtml:'+e.etag unless (!e.respond_to?:etag) || e.etag.nil?
|
113
112
|
end
|
114
113
|
end
|
115
114
|
FBPost.push(conf,'',doc.to_html,"Text",path)
|
@@ -1,5 +1,5 @@
|
|
1
|
-
require '
|
2
|
-
require 'document.rb'
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'html2fb/document.rb'
|
3
3
|
require 'progressbar'
|
4
4
|
#require 'ruby-prof'
|
5
5
|
#require 'term/ansicolor'
|
@@ -14,11 +14,11 @@ module HTML2FB
|
|
14
14
|
|
15
15
|
def parse(txt)
|
16
16
|
puts "Parsing HTML"
|
17
|
-
pdoc=
|
17
|
+
pdoc=Nokogiri::HTML(txt)
|
18
18
|
if @conf['conv']
|
19
19
|
mc=pdoc/'meta[@http-equiv="Content-Type"]'
|
20
20
|
if mc.size>0
|
21
|
-
charset=mc.first.attributes['content'].split(';').find do |s|
|
21
|
+
charset=mc.first.attributes['content'].to_s.split(';').find do |s|
|
22
22
|
s.strip[0,7]=='charset'
|
23
23
|
end
|
24
24
|
unless charset.nil?
|
@@ -28,7 +28,7 @@ module HTML2FB
|
|
28
28
|
unless tc.nil?
|
29
29
|
puts "Trying to convert source encoding from #{tc} to utf-8"
|
30
30
|
require 'iconv'
|
31
|
-
pdoc=
|
31
|
+
pdoc=Nokogiri::HTML(Iconv.conv('utf-8',tc.downcase,txt))
|
32
32
|
|
33
33
|
end
|
34
34
|
|
@@ -38,7 +38,7 @@ module HTML2FB
|
|
38
38
|
puts "Removing garbage elements"
|
39
39
|
remove_objs(pdoc)
|
40
40
|
ti=pdoc.at('title')
|
41
|
-
doc.title= ti.
|
41
|
+
doc.title= ti.text.strip unless ti.nil?
|
42
42
|
# pdoc.search('//h3').each do |e|
|
43
43
|
# doc.content.push(e.inner_text)
|
44
44
|
# end
|
@@ -58,10 +58,10 @@ module HTML2FB
|
|
58
58
|
doc.search('.'+cl).remove
|
59
59
|
end unless @conf['remove']['class'].nil?
|
60
60
|
@conf['remove']['expr'].each do |cl|
|
61
|
-
doc.search(cl).remove
|
61
|
+
doc.search(cl).remove rescue doc.xpath(cl).remove
|
62
62
|
end unless @conf['remove']['expr'].nil?
|
63
63
|
@conf['remove']['before'].each do |cl|
|
64
|
-
x=doc.at(cl)
|
64
|
+
x=doc.at(cl) rescue doc.at_xpath(cl)
|
65
65
|
if x
|
66
66
|
x.preceding.remove
|
67
67
|
x.parent.children.delete(x)
|
@@ -73,7 +73,7 @@ module HTML2FB
|
|
73
73
|
t.remove unless t.nil?
|
74
74
|
end unless @conf['remove']['between'].nil?
|
75
75
|
@conf['remove']['after'].each do |cl|
|
76
|
-
x=doc.at(cl)
|
76
|
+
x=doc.at(cl) rescue doc.at_xpath(cl)
|
77
77
|
if x
|
78
78
|
x.following.remove
|
79
79
|
x.parent.children.delete(x)
|
@@ -89,13 +89,13 @@ module HTML2FB
|
|
89
89
|
|
90
90
|
aut=build_autom(@conf['select'],ret)
|
91
91
|
|
92
|
-
pbar = ProgressBar.new("Parsing", doc.search('
|
93
|
-
doc.
|
92
|
+
pbar = ProgressBar.new("Parsing", doc.search('//*').size)
|
93
|
+
doc.traverse do |el|
|
94
94
|
aut.feed(el)
|
95
95
|
pbar.inc
|
96
96
|
end
|
97
|
-
pbar.finish
|
98
97
|
aut.finish(doc)
|
98
|
+
pbar.finish
|
99
99
|
=begin
|
100
100
|
result = RubyProf.stop
|
101
101
|
printer = RubyProf::FlatPrinter.new(result)
|
@@ -180,10 +180,10 @@ module HTML2FB
|
|
180
180
|
if @content=='body'
|
181
181
|
tmp=el.preceding[0..-1]
|
182
182
|
else
|
183
|
-
tmp=el.root.
|
183
|
+
tmp=el.root.between(@content,(el.path),true)[1..-1]
|
184
184
|
end
|
185
185
|
if tmp.blank? #search can'find between siblins
|
186
|
-
tmp=el.root.deep_between(@content,(el.
|
186
|
+
tmp=el.root.deep_between(@content,(el.path))
|
187
187
|
end
|
188
188
|
unless tmp.blank?
|
189
189
|
tmph=tmp.to_html
|
@@ -195,7 +195,7 @@ module HTML2FB
|
|
195
195
|
(lvl..@max_level).to_a.reverse.each do |l|
|
196
196
|
close_section(l)
|
197
197
|
end
|
198
|
-
@starts[lvl]=create_fbsection(el.root.
|
198
|
+
@starts[lvl]=create_fbsection(el.root.at_xpath(obj[:xpath]).text,obj[:fblevel])
|
199
199
|
@content=obj[:xpath]
|
200
200
|
@current_level=lvl
|
201
201
|
end
|
@@ -209,7 +209,7 @@ module HTML2FB
|
|
209
209
|
end
|
210
210
|
|
211
211
|
def feed(el)
|
212
|
-
return if el.
|
212
|
+
return if el.text?
|
213
213
|
@done=[[]*@levels.size]
|
214
214
|
|
215
215
|
@levels.each_with_index do |lvl,i|
|
@@ -218,7 +218,7 @@ module HTML2FB
|
|
218
218
|
if el.in_search?(expr['expr'])
|
219
219
|
|
220
220
|
|
221
|
-
open_section({:xpath => el.
|
221
|
+
open_section({:xpath => el.path, :fblevel => expr['fblevel']},i+1,el)
|
222
222
|
break
|
223
223
|
end
|
224
224
|
end
|
@@ -228,6 +228,9 @@ module HTML2FB
|
|
228
228
|
end
|
229
229
|
end
|
230
230
|
|
231
|
+
class Nokogiri::XML::NodeSet
|
232
|
+
alias :blank? :empty?
|
233
|
+
end
|
231
234
|
|
232
235
|
class String
|
233
236
|
def blank?
|
@@ -241,17 +244,22 @@ class NilClass
|
|
241
244
|
end
|
242
245
|
end
|
243
246
|
|
244
|
-
|
247
|
+
|
248
|
+
|
249
|
+
class Nokogiri::XML::Node
|
250
|
+
|
245
251
|
def in_search?(expr)
|
246
252
|
if expr !~ /[^a-z0-9]/
|
247
253
|
return self.name.downcase()==expr.downcase()
|
248
254
|
end
|
249
255
|
|
250
|
-
se_in=self.
|
256
|
+
se_in=self.root
|
257
|
+
se_in=self.parent if self.respond_to?(:parent)
|
251
258
|
if expr[0..1]=='/'
|
252
259
|
se_in=self.root
|
253
260
|
end
|
254
|
-
se_in.search(expr)
|
261
|
+
set=se_in.search(expr) rescue se_in.xpath(expr)
|
262
|
+
set.each do |el|
|
255
263
|
return true if el==self
|
256
264
|
end
|
257
265
|
# puts self.name+" "+expr
|
@@ -259,34 +267,60 @@ module Hpricot::Traverse
|
|
259
267
|
end
|
260
268
|
|
261
269
|
def root
|
262
|
-
|
263
|
-
se_in=self
|
264
|
-
se_in=se_in.parent until se_in.parent.nil?
|
265
|
-
@root=se_in
|
266
|
-
se_in
|
270
|
+
self.document.root
|
267
271
|
end
|
268
272
|
|
269
|
-
def
|
270
|
-
|
273
|
+
def node_position
|
274
|
+
return @node_position if @node_position
|
275
|
+
@node_position=parent.children.index(self)
|
271
276
|
end
|
272
277
|
|
273
|
-
def
|
274
|
-
|
275
|
-
|
276
|
-
|
278
|
+
def between(a,b,excl=false)
|
279
|
+
|
280
|
+
#from nokogiri
|
281
|
+
offset=(excl ? -1 : 0)
|
282
|
+
ary = []
|
283
|
+
ele1=at(a) rescue at_xpath(a)
|
284
|
+
ele2=at(b) rescue at_xpath(b)
|
285
|
+
|
286
|
+
if ele1 and ele2
|
287
|
+
# let's quickly take care of siblings
|
288
|
+
if ele1.parent == ele2.parent
|
289
|
+
|
290
|
+
ary = ele1.parent.children[ele1.node_position..(ele2.node_position+offset)]
|
291
|
+
else
|
292
|
+
# find common parent
|
293
|
+
ele1_p=ele1.ancestors
|
294
|
+
ele2_p=ele2.ancestors
|
295
|
+
common_parent = ele1_p.zip(ele2_p).select { |p1, p2| p1 == p2 }.flatten.first
|
296
|
+
|
297
|
+
child = nil
|
298
|
+
if ele1 == common_parent
|
299
|
+
child = ele2
|
300
|
+
elsif ele2 == common_parent
|
301
|
+
child = ele1
|
302
|
+
end
|
303
|
+
|
304
|
+
if child
|
305
|
+
ary = common_parent.children[0..(child.node_position+offset)]
|
306
|
+
end
|
307
|
+
end
|
277
308
|
end
|
278
|
-
|
309
|
+
|
310
|
+
return Nokogiri::XML::NodeSet.new(ele1.document,ary)
|
279
311
|
end
|
280
|
-
def deep_between(i,j)
|
281
312
|
|
282
|
-
|
283
|
-
|
313
|
+
|
314
|
+
|
315
|
+
def deep_between(i,j)
|
316
|
+
unless j.nil? || self.at_xpath(j).nil?
|
317
|
+
tm=self.at_xpath(i)
|
284
318
|
prec=tm.deep_preceding
|
285
|
-
r=
|
319
|
+
r=Nokogiri::XML::NodeSet.new(tm.document,[*self.at(j).deep_preceding.find_all{|el| !(prec.include?el || el==tm)}])
|
286
320
|
else
|
287
321
|
r=self.at(i).deep_following unless self.at(i).nil?
|
288
322
|
end
|
289
|
-
|
323
|
+
Nokogiri::XML::NodeSet.new(self.document,[*select_end(r,i)])
|
290
324
|
end
|
291
325
|
|
292
326
|
def select_end(tab,expr)
|
@@ -296,13 +330,15 @@ module Hpricot::Traverse
|
|
296
330
|
idx=-1
|
297
331
|
i=0
|
298
332
|
tab.each do |e|
|
299
|
-
|
333
|
+
nxp=expr.gsub(e.path,'.')
|
334
|
+
set=e.search(nxp) rescue e.xpath(nxp)
|
335
|
+
if set.size > 0
|
300
336
|
idx=i
|
301
337
|
#if e.search(i).size > 0
|
302
|
-
if e.children.find{|ee| ee.
|
338
|
+
if e.children.find{|ee| ee.path==expr }
|
303
339
|
e.children.each do |ee|
|
304
340
|
s << ee if f
|
305
|
-
f=true if ee.
|
341
|
+
f=true if ee.path==expr
|
306
342
|
end
|
307
343
|
else
|
308
344
|
s=select_end(e.children,expr)
|
@@ -316,20 +352,24 @@ module Hpricot::Traverse
|
|
316
352
|
return s+tab[(idx+1)..-1]
|
317
353
|
end
|
318
354
|
|
355
|
+
def preceding
|
356
|
+
self.parent.children[0...node_position]
|
357
|
+
end
|
358
|
+
|
359
|
+
def following
|
360
|
+
self.parent.children[node_position+1..-1]
|
361
|
+
end
|
362
|
+
|
319
363
|
def deep_preceding()
|
320
|
-
ret=
|
321
|
-
ret+=parent.deep_preceding if respond_to?(:parent)
|
364
|
+
ret=Nokogiri::XML::NodeSet.new(self.document,[])
|
365
|
+
ret+=parent.deep_preceding if respond_to?(:parent) && !parent.is_a?(Nokogiri::XML::Document)
|
322
366
|
ret+=preceding
|
323
|
-
|
367
|
+
ret
|
324
368
|
end
|
325
369
|
def deep_following()
|
326
370
|
ret=following
|
327
|
-
ret+=parent.deep_following if respond_to?(:parent)
|
328
|
-
|
371
|
+
ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Nokogiri::XML::Document)
|
372
|
+
ret
|
329
373
|
end
|
330
374
|
|
331
375
|
end
|
332
|
-
|
333
|
-
class Hpricot::Elements
|
334
|
-
alias_method :blank?, :empty?
|
335
|
-
end
|