Html2Feedbooks 1.1.1 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/Rakefile +1 -0
- data/bin/html2fb.rb +4 -7
- data/html2fb.gemspec +27 -0
- data/lib/{app.rb → html2fb/app.rb} +8 -9
- data/lib/{conf.rb → html2fb/conf.rb} +0 -0
- data/lib/{document.rb → html2fb/document.rb} +0 -0
- data/lib/{downloader.rb → html2fb/downloader.rb} +0 -0
- data/lib/{feedbooks.rb → html2fb/feedbooks.rb} +5 -6
- data/lib/{parser.rb → html2fb/parser.rb} +89 -49
- data/lib/html2fb/version.rb +3 -0
- data/lib/html2fb.rb +7 -0
- data/samples/107-h.htm +19642 -0
- data/samples/107-h2.htm +1259 -0
- data/samples/3049-h.htm +7787 -0
- data/samples/3058-h.htm +8732 -0
- data/samples/3258-h.htm +19894 -0
- data/samples/3258-h2.htm +686 -0
- data/samples/3469-h.htm +14024 -0
- data/samples/conf107-h.yml +27 -0
- data/samples/conf3049-h.yml +27 -0
- data/samples/conf3058-h.yml +27 -0
- data/samples/conf3258-h.yml +26 -0
- data/samples/conf3469-h.yml +32 -0
- metadata +41 -21
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/bin/html2fb.rb
CHANGED
@@ -1,14 +1,11 @@
|
|
1
|
-
#!/usr/bin/ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
2
3
|
require 'optparse'
|
3
4
|
require 'open-uri'
|
4
|
-
require 'conf.rb'
|
5
|
-
require 'downloader.rb'
|
6
|
-
require 'document.rb'
|
7
|
-
require 'parser.rb'
|
8
|
-
require 'feedbooks.rb'
|
9
5
|
require 'tmpdir'
|
10
6
|
require 'launchy'
|
11
7
|
require 'digest/md5'
|
8
|
+
require 'lib/html2fb'
|
12
9
|
|
13
10
|
include HTML2FB
|
14
11
|
|
@@ -86,7 +83,7 @@ if options[:preview]
|
|
86
83
|
puts "A preview of the parsed file should be opening in your webbrowser now"
|
87
84
|
puts "If nothing open you can open the file located at : #{page}"
|
88
85
|
puts "When happy with the parsed output rerun with -s option to send to Feedbooks.com"
|
89
|
-
Launchy
|
86
|
+
Launchy.open(page)
|
90
87
|
else
|
91
88
|
doc.to_feedbooks(conf)
|
92
89
|
end
|
data/html2fb.gemspec
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "html2fb/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |s|
|
6
|
+
s.name = "Html2Feedbooks"
|
7
|
+
s.version = Html2fb::VERSION
|
8
|
+
s.authors = ["Benoit Larroque"]
|
9
|
+
s.email = ["benoit dot larroque at feedbooks dot com"]
|
10
|
+
s.summary = %q{Html2Feedbooks is script to automate basic publishing on feedbooks.com}
|
11
|
+
s.homepage = %q{http://github.com/zetaben/Html2Feedbooks}
|
12
|
+
s.description = %q{Html2Feedbooks is script to automate basic publishing on feedbooks.com}
|
13
|
+
|
14
|
+
s.files = `git ls-files`.split("\n")
|
15
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
16
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
17
|
+
s.require_paths = ["lib"]
|
18
|
+
s.default_executable = 'html2fb.rb'
|
19
|
+
s.add_dependency('nokogiri','>=1.4.1')
|
20
|
+
s.add_dependency('htmlentities', '>= 4.2.1')
|
21
|
+
s.add_dependency('launchy', '>= 2.0.0')
|
22
|
+
s.add_dependency('progressbar', '>= 0.0.3')
|
23
|
+
|
24
|
+
# specify any dependencies here; for example:
|
25
|
+
# s.add_development_dependency "rspec"
|
26
|
+
# s.add_runtime_dependency "rest-client"
|
27
|
+
end
|
@@ -34,8 +34,8 @@ class AtomPost
|
|
34
34
|
req = Net::HTTP::Get.new(url.path)
|
35
35
|
req.basic_auth user,pass unless user.nil?
|
36
36
|
response = http.request(req)
|
37
|
-
doc=
|
38
|
-
e=doc.at('//entry
|
37
|
+
doc=Nokogiri::XML(response.body).remove_namespaces!
|
38
|
+
e=doc.at('//entry/link[@rel="down"]')
|
39
39
|
return URI.parse(e[:href]).path unless e.nil?
|
40
40
|
}
|
41
41
|
end
|
@@ -47,7 +47,6 @@ class AtomPost
|
|
47
47
|
#STDERR.puts "sending to #{url}"
|
48
48
|
req = Net::HTTP::Post.new(url.path)
|
49
49
|
req.basic_auth user,pass unless user.nil?
|
50
|
-
|
51
50
|
req.body = '<?xml version="1.0"?>'+"\n"
|
52
51
|
req.body +='<entry xmlns="http://www.w3.org/2005/Atom">'+"\n"
|
53
52
|
req.body +='<title>'+decode_text(title)+'</title>'+"\n"
|
@@ -77,9 +76,9 @@ class AtomPost
|
|
77
76
|
|
78
77
|
def recode_text(txt)
|
79
78
|
return txt if txt.blank?
|
80
|
-
m=
|
81
|
-
m.
|
82
|
-
m.
|
79
|
+
m=Nokogiri::XML("<text>#{txt}</text>")
|
80
|
+
m.traverse{|t| next unless t.text?;t.text=force_decimal_entities(t.text) if t.text.match(/&[a-z][a-z0-9]+;/i)}
|
81
|
+
m.root.inner_html
|
83
82
|
end
|
84
83
|
HTMLENCODER=HTMLEntities.new
|
85
84
|
def force_decimal_entities(txt)
|
@@ -88,9 +87,9 @@ class AtomPost
|
|
88
87
|
|
89
88
|
def decode_text(txt)
|
90
89
|
return txt if txt.blank?
|
91
|
-
m=
|
92
|
-
m.
|
93
|
-
m.
|
90
|
+
m=Nokogiri::XML("<text>#{txt}</text>")
|
91
|
+
m.traverse{|t| next unless t.text?; HTMLENCODER.decode(t.text)}
|
92
|
+
m.root.inner_html
|
94
93
|
end
|
95
94
|
|
96
95
|
end
|
File without changes
|
File without changes
|
File without changes
|
@@ -1,5 +1,5 @@
|
|
1
|
-
require 'app.rb'
|
2
|
-
require '
|
1
|
+
require 'html2fb/app.rb'
|
2
|
+
require 'nokogiri'
|
3
3
|
require 'digest/md5'
|
4
4
|
|
5
5
|
module HTML2FB
|
@@ -105,11 +105,10 @@ module HTML2FB
|
|
105
105
|
def to_feedbooks(conf,path=nil)
|
106
106
|
stxt=to_html
|
107
107
|
return unless stxt.strip.size > 0
|
108
|
-
doc=
|
109
|
-
doc.
|
110
|
-
|
108
|
+
doc=Nokogiri::XML('<div xmlns:xhtml="http://www.w3.org/1999/xhtml">'+stxt+'</div>')
|
109
|
+
doc.traverse do |e|
|
110
|
+
if e.element?
|
111
111
|
e.name='xhtml:'+e.name
|
112
|
-
e.etag='xhtml:'+e.etag unless (!e.respond_to?:etag) || e.etag.nil?
|
113
112
|
end
|
114
113
|
end
|
115
114
|
FBPost.push(conf,'',doc.to_html,"Text",path)
|
@@ -1,5 +1,5 @@
|
|
1
|
-
require '
|
2
|
-
require 'document.rb'
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'html2fb/document.rb'
|
3
3
|
require 'progressbar'
|
4
4
|
#require 'ruby-prof'
|
5
5
|
#require 'term/ansicolor'
|
@@ -14,11 +14,11 @@ module HTML2FB
|
|
14
14
|
|
15
15
|
def parse(txt)
|
16
16
|
puts "Parsing HTML"
|
17
|
-
pdoc=
|
17
|
+
pdoc=Nokogiri::HTML(txt)
|
18
18
|
if @conf['conv']
|
19
19
|
mc=pdoc/'meta[@http-equiv="Content-Type"]'
|
20
20
|
if mc.size>0
|
21
|
-
charset=mc.first.attributes['content'].split(';').find do |s|
|
21
|
+
charset=mc.first.attributes['content'].to_s.split(';').find do |s|
|
22
22
|
s.strip[0,7]=='charset'
|
23
23
|
end
|
24
24
|
unless charset.nil?
|
@@ -28,7 +28,7 @@ module HTML2FB
|
|
28
28
|
unless tc.nil?
|
29
29
|
puts "Trying to convert source encoding from #{tc} to utf-8"
|
30
30
|
require 'iconv'
|
31
|
-
pdoc=
|
31
|
+
pdoc=Nokogiri::HTML(Iconv.conv('utf-8',tc.downcase,txt))
|
32
32
|
|
33
33
|
end
|
34
34
|
|
@@ -38,7 +38,7 @@ module HTML2FB
|
|
38
38
|
puts "Removing garbage elements"
|
39
39
|
remove_objs(pdoc)
|
40
40
|
ti=pdoc.at('title')
|
41
|
-
doc.title= ti.
|
41
|
+
doc.title= ti.text.strip unless ti.nil?
|
42
42
|
# pdoc.search('//h3').each do |e|
|
43
43
|
# doc.content.push(e.inner_text)
|
44
44
|
# end
|
@@ -58,10 +58,10 @@ module HTML2FB
|
|
58
58
|
doc.search('.'+cl).remove
|
59
59
|
end unless @conf['remove']['class'].nil?
|
60
60
|
@conf['remove']['expr'].each do |cl|
|
61
|
-
doc.search(cl).remove
|
61
|
+
doc.search(cl).remove rescue doc.xpath(cl).remove
|
62
62
|
end unless @conf['remove']['expr'].nil?
|
63
63
|
@conf['remove']['before'].each do |cl|
|
64
|
-
x=doc.at(cl)
|
64
|
+
x=doc.at(cl) rescue doc.at_xpath(cl)
|
65
65
|
if x
|
66
66
|
x.preceding.remove
|
67
67
|
x.parent.children.delete(x)
|
@@ -73,7 +73,7 @@ module HTML2FB
|
|
73
73
|
t.remove unless t.nil?
|
74
74
|
end unless @conf['remove']['between'].nil?
|
75
75
|
@conf['remove']['after'].each do |cl|
|
76
|
-
x=doc.at(cl)
|
76
|
+
x=doc.at(cl) rescue doc.at_xpath(cl)
|
77
77
|
if x
|
78
78
|
x.following.remove
|
79
79
|
x.parent.children.delete(x)
|
@@ -89,13 +89,13 @@ module HTML2FB
|
|
89
89
|
|
90
90
|
aut=build_autom(@conf['select'],ret)
|
91
91
|
|
92
|
-
pbar = ProgressBar.new("Parsing", doc.search('
|
93
|
-
doc.
|
92
|
+
pbar = ProgressBar.new("Parsing", doc.search('//*').size)
|
93
|
+
doc.traverse do |el|
|
94
94
|
aut.feed(el)
|
95
95
|
pbar.inc
|
96
96
|
end
|
97
|
-
pbar.finish
|
98
97
|
aut.finish(doc)
|
98
|
+
pbar.finish
|
99
99
|
=begin
|
100
100
|
result = RubyProf.stop
|
101
101
|
printer = RubyProf::FlatPrinter.new(result)
|
@@ -180,10 +180,10 @@ module HTML2FB
|
|
180
180
|
if @content=='body'
|
181
181
|
tmp=el.preceding[0..-1]
|
182
182
|
else
|
183
|
-
tmp=el.root.
|
183
|
+
tmp=el.root.between(@content,(el.path),true)[1..-1]
|
184
184
|
end
|
185
185
|
if tmp.blank? #search can'find between siblins
|
186
|
-
tmp=el.root.deep_between(@content,(el.
|
186
|
+
tmp=el.root.deep_between(@content,(el.path))
|
187
187
|
end
|
188
188
|
unless tmp.blank?
|
189
189
|
tmph=tmp.to_html
|
@@ -195,7 +195,7 @@ module HTML2FB
|
|
195
195
|
(lvl..@max_level).to_a.reverse.each do |l|
|
196
196
|
close_section(l)
|
197
197
|
end
|
198
|
-
@starts[lvl]=create_fbsection(el.root.
|
198
|
+
@starts[lvl]=create_fbsection(el.root.at_xpath(obj[:xpath]).text,obj[:fblevel])
|
199
199
|
@content=obj[:xpath]
|
200
200
|
@current_level=lvl
|
201
201
|
end
|
@@ -209,7 +209,7 @@ module HTML2FB
|
|
209
209
|
end
|
210
210
|
|
211
211
|
def feed(el)
|
212
|
-
return if el.
|
212
|
+
return if el.text?
|
213
213
|
@done=[[]*@levels.size]
|
214
214
|
|
215
215
|
@levels.each_with_index do |lvl,i|
|
@@ -218,7 +218,7 @@ module HTML2FB
|
|
218
218
|
if el.in_search?(expr['expr'])
|
219
219
|
|
220
220
|
|
221
|
-
open_section({:xpath => el.
|
221
|
+
open_section({:xpath => el.path, :fblevel => expr['fblevel']},i+1,el)
|
222
222
|
break
|
223
223
|
end
|
224
224
|
end
|
@@ -228,6 +228,9 @@ module HTML2FB
|
|
228
228
|
end
|
229
229
|
end
|
230
230
|
|
231
|
+
class Nokogiri::XML::NodeSet
|
232
|
+
alias :blank? :empty?
|
233
|
+
end
|
231
234
|
|
232
235
|
class String
|
233
236
|
def blank?
|
@@ -241,17 +244,22 @@ class NilClass
|
|
241
244
|
end
|
242
245
|
end
|
243
246
|
|
244
|
-
|
247
|
+
|
248
|
+
|
249
|
+
class Nokogiri::XML::Node
|
250
|
+
|
245
251
|
def in_search?(expr)
|
246
252
|
if expr !~ /[^a-z0-9]/
|
247
253
|
return self.name.downcase()==expr.downcase()
|
248
254
|
end
|
249
255
|
|
250
|
-
se_in=self.
|
256
|
+
se_in=self.root
|
257
|
+
se_in=self.parent if self.respond_to?(:parent)
|
251
258
|
if expr[0..1]=='/'
|
252
259
|
se_in=self.root
|
253
260
|
end
|
254
|
-
se_in.search(expr)
|
261
|
+
set=se_in.search(expr) rescue se_in.xpath(expr)
|
262
|
+
set.each do |el|
|
255
263
|
return true if el==self
|
256
264
|
end
|
257
265
|
# puts self.name+" "+expr
|
@@ -259,34 +267,60 @@ module Hpricot::Traverse
|
|
259
267
|
end
|
260
268
|
|
261
269
|
def root
|
262
|
-
|
263
|
-
se_in=self
|
264
|
-
se_in=se_in.parent until se_in.parent.nil?
|
265
|
-
@root=se_in
|
266
|
-
se_in
|
270
|
+
self.document.root
|
267
271
|
end
|
268
272
|
|
269
|
-
def
|
270
|
-
|
273
|
+
def node_position
|
274
|
+
return @node_position if @node_position
|
275
|
+
@node_position=parent.children.index(self)
|
271
276
|
end
|
272
277
|
|
273
|
-
def
|
274
|
-
|
275
|
-
|
276
|
-
|
278
|
+
def between(a,b,excl=false)
|
279
|
+
|
280
|
+
#from nokogiri
|
281
|
+
offset=(excl ? -1 : 0)
|
282
|
+
ary = []
|
283
|
+
ele1=at(a) rescue at_xpath(a)
|
284
|
+
ele2=at(b) rescue at_xpath(b)
|
285
|
+
|
286
|
+
if ele1 and ele2
|
287
|
+
# let's quickly take care of siblings
|
288
|
+
if ele1.parent == ele2.parent
|
289
|
+
|
290
|
+
ary = ele1.parent.children[ele1.node_position..(ele2.node_position+offset)]
|
291
|
+
else
|
292
|
+
# find common parent
|
293
|
+
ele1_p=ele1.ancestors
|
294
|
+
ele2_p=ele2.ancestors
|
295
|
+
common_parent = ele1_p.zip(ele2_p).select { |p1, p2| p1 == p2 }.flatten.first
|
296
|
+
|
297
|
+
child = nil
|
298
|
+
if ele1 == common_parent
|
299
|
+
child = ele2
|
300
|
+
elsif ele2 == common_parent
|
301
|
+
child = ele1
|
302
|
+
end
|
303
|
+
|
304
|
+
if child
|
305
|
+
ary = common_parent.children[0..(child.node_position+offset)]
|
306
|
+
end
|
307
|
+
end
|
277
308
|
end
|
278
|
-
|
309
|
+
|
310
|
+
return Nokogiri::XML::NodeSet.new(ele1.document,ary)
|
279
311
|
end
|
280
|
-
def deep_between(i,j)
|
281
312
|
|
282
|
-
|
283
|
-
|
313
|
+
|
314
|
+
|
315
|
+
def deep_between(i,j)
|
316
|
+
unless j.nil? || self.at_xpath(j).nil?
|
317
|
+
tm=self.at_xpath(i)
|
284
318
|
prec=tm.deep_preceding
|
285
|
-
r=
|
319
|
+
r=Nokogiri::XML::NodeSet.new(tm.document,[*self.at(j).deep_preceding.find_all{|el| !(prec.include?el || el==tm)}])
|
286
320
|
else
|
287
321
|
r=self.at(i).deep_following unless self.at(i).nil?
|
288
322
|
end
|
289
|
-
|
323
|
+
Nokogiri::XML::NodeSet.new(self.document,[*select_end(r,i)])
|
290
324
|
end
|
291
325
|
|
292
326
|
def select_end(tab,expr)
|
@@ -296,13 +330,15 @@ module Hpricot::Traverse
|
|
296
330
|
idx=-1
|
297
331
|
i=0
|
298
332
|
tab.each do |e|
|
299
|
-
|
333
|
+
nxp=expr.gsub(e.path,'.')
|
334
|
+
set=e.search(nxp) rescue e.xpath(nxp)
|
335
|
+
if set.size > 0
|
300
336
|
idx=i
|
301
337
|
#if e.search(i).size > 0
|
302
|
-
if e.children.find{|ee| ee.
|
338
|
+
if e.children.find{|ee| ee.path==expr }
|
303
339
|
e.children.each do |ee|
|
304
340
|
s << ee if f
|
305
|
-
f=true if ee.
|
341
|
+
f=true if ee.path==expr
|
306
342
|
end
|
307
343
|
else
|
308
344
|
s=select_end(e.children,expr)
|
@@ -316,20 +352,24 @@ module Hpricot::Traverse
|
|
316
352
|
return s+tab[(idx+1)..-1]
|
317
353
|
end
|
318
354
|
|
355
|
+
def preceding
|
356
|
+
self.parent.children[0...node_position]
|
357
|
+
end
|
358
|
+
|
359
|
+
def following
|
360
|
+
self.parent.children[node_position+1..-1]
|
361
|
+
end
|
362
|
+
|
319
363
|
def deep_preceding()
|
320
|
-
ret=
|
321
|
-
ret+=parent.deep_preceding if respond_to?(:parent)
|
364
|
+
ret=Nokogiri::XML::NodeSet.new(self.document,[])
|
365
|
+
ret+=parent.deep_preceding if respond_to?(:parent) && !parent.is_a?(Nokogiri::XML::Document)
|
322
366
|
ret+=preceding
|
323
|
-
|
367
|
+
ret
|
324
368
|
end
|
325
369
|
def deep_following()
|
326
370
|
ret=following
|
327
|
-
ret+=parent.deep_following if respond_to?(:parent)
|
328
|
-
|
371
|
+
ret+=parent.deep_following if respond_to?(:parent) && !parent.is_a?(Nokogiri::XML::Document)
|
372
|
+
ret
|
329
373
|
end
|
330
374
|
|
331
375
|
end
|
332
|
-
|
333
|
-
class Hpricot::Elements
|
334
|
-
alias_method :blank?, :empty?
|
335
|
-
end
|