feed_yamlizer 0.0.5 → 0.0.6
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions exactly as they appear in their respective public registries.
- data/bin/feed2yaml +8 -2
- data/lib/feed_yamlizer/html_cleaner.rb +1 -1
- data/lib/feed_yamlizer/html_listener.rb +30 -5
- data/lib/feed_yamlizer/version.rb +1 -1
- data/lib/feed_yamlizer.rb +5 -9
- metadata +3 -3
data/bin/feed2yaml
CHANGED
@@ -8,7 +8,7 @@ rescue LoadError
|
|
8
8
|
end
|
9
9
|
require 'open-uri'
|
10
10
|
|
11
|
-
#
|
11
|
+
# just prints the text, not yaml
|
12
12
|
def print_text(res)
|
13
13
|
res[:items].each {|x|
|
14
14
|
puts '-' * 30
|
@@ -18,13 +18,19 @@ def print_text(res)
|
|
18
18
|
}
|
19
19
|
end
|
20
20
|
|
21
|
+
if ARGV.first == '-t' # text
|
22
|
+
puts "Printing text form"
|
23
|
+
@text = true
|
24
|
+
ARGV.shift
|
25
|
+
end
|
26
|
+
|
21
27
|
result = if STDIN.tty?
|
22
28
|
FeedYamlizer.process_url ARGV.first
|
23
29
|
else
|
24
30
|
FeedYamlizer.process_xml STDIN.read
|
25
31
|
end
|
26
32
|
|
27
|
-
if
|
33
|
+
if @text
|
28
34
|
print_text result
|
29
35
|
else
|
30
36
|
puts result.to_yaml
|
data/lib/feed_yamlizer/html_cleaner.rb
CHANGED
@@ -43,7 +43,7 @@ class FeedYamlizer
|
|
43
43
|
#output = IO.popen("tidy -q -n -wrap 120 -asxml -latin1", "r+") do |pipe|
|
44
44
|
#output = IO.popen("tidy -q -wrap 120 -raw -asxml ", "r+") do |pipe| # if from latin1
|
45
45
|
|
46
|
-
tidy = "tidy -q -wrap 120 -n -utf8 -asxml 2>/dev/null"
|
46
|
+
tidy = "tidy -q -wrap 120 -n -raw -utf8 -asxml 2>/dev/null"
|
47
47
|
output = IO.popen(tidy, "r+") do |pipe|
|
48
48
|
input = <<-END
|
49
49
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
data/lib/feed_yamlizer/html_listener.rb
CHANGED
@@ -24,8 +24,34 @@ class FeedYamlizer
|
|
24
24
|
|
25
25
|
digits = @links.size.to_s.size
|
26
26
|
|
27
|
+
# wrap the text
|
27
28
|
x = format(x)
|
28
29
|
|
30
|
+
# delete extra blank lines
|
31
|
+
x = x.split(/\n\n+/).join("\n\n")
|
32
|
+
|
33
|
+
# format the blockquotes
|
34
|
+
line_buffer = []
|
35
|
+
blockquote_buffer = []
|
36
|
+
inblock = false
|
37
|
+
x.split(/\n/).each do |line|
|
38
|
+
if line == '[blockquote]'
|
39
|
+
inblock = true
|
40
|
+
elsif line == '[/blockquote]'
|
41
|
+
inblock = false
|
42
|
+
block = blockquote_buffer.join("\n")
|
43
|
+
line_buffer << format(block, '-c')
|
44
|
+
blockquote_buffer = []
|
45
|
+
else
|
46
|
+
if inblock
|
47
|
+
blockquote_buffer << " " * 4 + line.to_s
|
48
|
+
else
|
49
|
+
line_buffer << line
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
x = line_buffer.join("\n")
|
54
|
+
|
29
55
|
x + "\n\n" + @links.map {|x|
|
30
56
|
gutter = x[:index].to_s.rjust(digits)
|
31
57
|
if x[:content] && x[:content].strip.length > 0
|
@@ -48,7 +74,7 @@ class FeedYamlizer
|
|
48
74
|
@in_link = true
|
49
75
|
when 'img'
|
50
76
|
text = attrs['alt'] || attrs['title']
|
51
|
-
chunk = [
|
77
|
+
chunk = "[img:#{text}] "
|
52
78
|
@content[-1] << chunk
|
53
79
|
when *HEADER_TAGS
|
54
80
|
@content << "<#{UNIFORM_HEADER_TAG}>"
|
@@ -57,7 +83,7 @@ class FeedYamlizer
|
|
57
83
|
# @content << ""
|
58
84
|
@content[-1] += " "
|
59
85
|
when 'blockquote'
|
60
|
-
@content
|
86
|
+
@content += ["[blockquote]", ""]
|
61
87
|
when 'ul', 'ol', 'dl'
|
62
88
|
@content << "<#{name}>"
|
63
89
|
when 'li', 'dt', 'dd'
|
@@ -102,7 +128,6 @@ class FeedYamlizer
|
|
102
128
|
return
|
103
129
|
end
|
104
130
|
|
105
|
-
# probably slow, but ok for now
|
106
131
|
@content[-1] << text
|
107
132
|
end
|
108
133
|
|
@@ -114,8 +139,8 @@ class FeedYamlizer
|
|
114
139
|
@nested_tags.join('/')
|
115
140
|
end
|
116
141
|
|
117
|
-
def format(x)
|
118
|
-
IO.popen("fmt", "r+") do |pipe|
|
142
|
+
def format(x, flags='')
|
143
|
+
IO.popen("fmt #{flags}", "r+") do |pipe|
|
119
144
|
pipe.puts x
|
120
145
|
pipe.close_write
|
121
146
|
pipe.read
|
data/lib/feed_yamlizer.rb
CHANGED
@@ -14,6 +14,7 @@ require 'feed_yamlizer/textifier'
|
|
14
14
|
require 'fileutils'
|
15
15
|
require 'yaml'
|
16
16
|
require 'htmlentities'
|
17
|
+
require 'string_ext'
|
17
18
|
|
18
19
|
class FeedYamlizer
|
19
20
|
include FileUtils::Verbose
|
@@ -70,16 +71,11 @@ class FeedYamlizer
|
|
70
71
|
|
71
72
|
class << self
|
72
73
|
def xml_encoding(rawxml)
|
73
|
-
|
74
|
-
encoding = x && x[0] && x[0][0]
|
74
|
+
encoding = rawxml[/encoding=["']([^"']+)["']/,1]
|
75
75
|
STDERR.puts "xml encoding: #{encoding.inspect}"
|
76
76
|
encoding
|
77
77
|
end
|
78
78
|
|
79
|
-
def to_utf(x, encoding = 'ISO-8859-1')
|
80
|
-
x = Iconv.conv("UTF-8//TRANSLIT//IGNORE", encoding, x)
|
81
|
-
end
|
82
|
-
|
83
79
|
def check_for_tidy
|
84
80
|
if `which tidy` == ''
|
85
81
|
abort "Please install tidy"
|
@@ -87,9 +83,9 @@ class FeedYamlizer
|
|
87
83
|
end
|
88
84
|
|
89
85
|
# main method
|
90
|
-
def run(feed_xml, encoding)
|
86
|
+
def run(feed_xml, encoding='UTF-8')
|
91
87
|
check_for_tidy
|
92
|
-
feed_xml =
|
88
|
+
feed_xml = Iconv.conv("UTF-8//TRANSLIT//IGNORE", encoding, feed_xml)
|
93
89
|
parsed_data = FeedYamlizer::FeedParser.new(feed_xml).result
|
94
90
|
result = FeedYamlizer.new(parsed_data).result
|
95
91
|
result
|
@@ -104,7 +100,7 @@ class FeedYamlizer
|
|
104
100
|
charset = response.charset
|
105
101
|
#STDERR.puts "charset: #{charset}"
|
106
102
|
xml = response.read
|
107
|
-
encoding = charset || xml_encoding(xml) || "
|
103
|
+
encoding = charset || xml_encoding(xml) || "UTF-8"
|
108
104
|
run xml, encoding
|
109
105
|
end
|
110
106
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 5
|
9
|
-
version: 0.0.5
|
8
|
+
- 6
|
9
|
+
version: 0.0.6
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Daniel Choi
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-02-
|
17
|
+
date: 2011-02-20 00:00:00 -05:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|