zetaben-Html2Feedbooks 1.0.5 → 1.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/html2fb.rb +5 -1
- data/lib/conf.rb +2 -1
- data/lib/parser.rb +19 -0
- metadata +1 -1
data/bin/html2fb.rb
CHANGED
@@ -14,6 +14,7 @@ include HTML2FB
|
|
14
14
|
options = {}
|
15
15
|
options[:conf] = "conf.yaml"
|
16
16
|
options[:preview] = true
|
17
|
+
options[:conv] = true
|
17
18
|
OptionParser.new do |opts|
|
18
19
|
opts.banner = "Usage: html2fb [options] URL"
|
19
20
|
|
@@ -23,6 +24,9 @@ OptionParser.new do |opts|
|
|
23
24
|
opts.on("-s", "-s","Send to feedbooks") do |f|
|
24
25
|
options[:preview] = !f
|
25
26
|
end
|
27
|
+
opts.on("-nc", "--no-conv","No charset conversion") do |f|
|
28
|
+
options[:conv] = !f
|
29
|
+
end
|
26
30
|
end.parse!
|
27
31
|
|
28
32
|
valid=false
|
@@ -40,7 +44,7 @@ while !valid
|
|
40
44
|
print "URL : " if entry.nil? || entry==''
|
41
45
|
entry=STDIN.readline.strip unless valid
|
42
46
|
end
|
43
|
-
conf=Conf.new(options[:conf])
|
47
|
+
conf=Conf.new(options[:conf],options[:conv])
|
44
48
|
content=Downloader.download(url)
|
45
49
|
#puts content.size
|
46
50
|
doc=Parser.new(conf).parse(content)
|
data/lib/conf.rb
CHANGED
@@ -2,13 +2,14 @@ require 'yaml'
|
|
2
2
|
|
3
3
|
module HTML2FB
|
4
4
|
class Conf
|
5
|
-
def initialize(file)
|
5
|
+
def initialize(file,conv)
|
6
6
|
['','./',"#{File.dirname(__FILE__)}/","#{File.dirname(__FILE__)}/../confs/"].each do |p|
|
7
7
|
f=p+file
|
8
8
|
begin
|
9
9
|
if File.readable?(f) && File.exists?(f)
|
10
10
|
@conf=File.open(f,'r'){|txt| YAML::load(txt)}
|
11
11
|
puts "loaded config file : "+f
|
12
|
+
@conf['conv']=conv
|
12
13
|
return
|
13
14
|
end
|
14
15
|
rescue Exception => e
|
data/lib/parser.rb
CHANGED
@@ -15,6 +15,25 @@ module HTML2FB
|
|
15
15
|
def parse(txt)
|
16
16
|
puts "Parsing HTML"
|
17
17
|
pdoc=Hpricot(txt)
|
18
|
+
if @conf['conv']
|
19
|
+
mc=pdoc/'meta[@http-equiv="Content-Type"]'
|
20
|
+
if mc.size>0
|
21
|
+
charset=mc.first.attributes['content'].split(';').find do |s|
|
22
|
+
s.strip[0,7]=='charset'
|
23
|
+
end
|
24
|
+
unless charset.nil?
|
25
|
+
tc=charset.split('=').last.strip
|
26
|
+
end
|
27
|
+
|
28
|
+
unless tc.nil?
|
29
|
+
puts "Trying to convert source encoding from #{tc} to utf-8"
|
30
|
+
require 'iconv'
|
31
|
+
pdoc=Hpricot(Iconv.conv('utf-8',tc.downcase,txt))
|
32
|
+
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
36
|
+
end
|
18
37
|
doc=Document.new
|
19
38
|
puts "Removing garbage elements"
|
20
39
|
remove_objs(pdoc)
|