penso-feedparser 0.8
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +340 -0
- data/ChangeLog +59 -0
- data/LICENSE +60 -0
- data/README +14 -0
- data/Rakefile +85 -0
- data/lib/feedparser.rb +28 -0
- data/lib/feedparser/feedparser.rb +372 -0
- data/lib/feedparser/filesizes.rb +14 -0
- data/lib/feedparser/html-output.rb +126 -0
- data/lib/feedparser/html2text-parser.rb +413 -0
- data/lib/feedparser/rexml_patch.rb +28 -0
- data/lib/feedparser/sgml-parser.rb +332 -0
- data/lib/feedparser/text-output.rb +108 -0
- data/lib/feedparser/textconverters.rb +120 -0
- data/setup.rb +1586 -0
- data/test/tc_feed_parse.rb +117 -0
- data/test/tc_htmloutput.rb +52 -0
- data/test/tc_parser.rb +48 -0
- data/test/tc_textoutput.rb +48 -0
- data/test/tc_textwrappedoutput.rb +48 -0
- data/test/ts_feedparser.rb +12 -0
- data/tools/doctoweb.bash +30 -0
- metadata +85 -0
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'feedparser/textconverters'
|
2
|
+
|
3
|
+
# Patch for REXML
|
4
|
+
# Very ugly patch to make REXML error-proof.
|
5
|
+
# The problem is REXML uses IConv, which isn't error-proof at all.
|
6
|
+
# With those changes, it uses unpack/pack with some error handling
|
7
|
+
module REXML
  # Overrides for REXML's encoding mix-in: conversion is delegated to
  # String#toUTF8 instead of REXML's own machinery.
  # NOTE(review): String#toUTF8 is presumably provided by
  # 'feedparser/textconverters', which this file requires - confirm.
  module Encoding
    # Returns +str+ converted to UTF-8, interpreting it in the encoding
    # currently stored in @encoding.
    def decode(str)
      str.toUTF8(@encoding)
    end

    # Returns +str+ untouched; no re-encoding is performed on output.
    def encode(str)
      str
    end

    # Remembers +enc+ as the current encoding, falling back to 'utf-8' when
    # +enc+ is nil or false. Nothing happens if the same value is already set.
    def encoding=(enc)
      already_set = defined?(@encoding) && enc == @encoding
      @encoding = enc || 'utf-8' unless already_set
    end
  end

  class Element
    # Exposes the element's @children collection itself (not a copy).
    attr_reader :children
  end
end
|
@@ -0,0 +1,332 @@
|
|
1
|
+
# A parser for SGML, using the derived class as static DTD.
|
2
|
+
# from http://raa.ruby-lang.org/project/html-parser
|
3
|
+
module FeedParser
  # Event-driven tokenizer for SGML/HTML text.
  #
  # Markup is pushed in with #feed (as many times as needed) and flushed with
  # #close. Subclasses describe the tags they understand by defining methods:
  #
  # * <tt>start_TAG(attrs)</tt> / <tt>end_TAG</tt> for container elements
  #   (the tag is tracked on an internal stack of open elements);
  # * <tt>do_TAG(attrs)</tt> for elements without an end tag.
  #
  # Everything else is reported through the overridable hooks at the bottom
  # of the class (#handle_data, #handle_comment, #unknown_starttag, ...),
  # all of which do nothing by default. +attrs+ is an array of
  # <tt>[lowercased_name, value]</tt> pairs.
  class SGMLParser
    # Regular expressions used for parsing:
    Interesting = /[&<]/
    Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                                '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                                '![^<>]*)?')

    Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*);/
    Charref = /&#([0-9]+);/

    Starttagopen = /<[>a-zA-Z]/
    Endtagopen = /<\/[<>a-zA-Z]/
    Endbracket = /[<>]/
    Special = /<![^<>]*>/
    Commentopen = /<!--/
    Commentclose = /--[ \t\n]*>/
    Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
    Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                              '(\s*=\s*' +
                              "('[^']*'" +
                              '|"[^"]*"' +
                              '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')

    # Entity names resolved by #handle_entityref.
    Entitydefs =
      {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

    # verbose:: when true, #report_unbalanced prints diagnostics.
    def initialize(verbose=false)
      @verbose = verbose
      reset
    end

    # Drop any buffered input and return to the initial parsing state.
    def reset
      @rawdata = ''
      @stack = []
      @lasttag = '???'
      @nomoretags = false
      @literal = false
    end

    # True if an element named +gi+ is currently open.
    def has_context(gi)
      @stack.include? gi
    end

    # Treat all remaining input as plain character data.
    def setnomoretags
      @nomoretags = true
      @literal = true
    end

    # Enter literal mode: start tags, comments and <!...> declarations are
    # passed through as data until the next end tag is parsed.
    def setliteral(*args)
      @literal = true
    end

    # Append +data+ to the buffer and parse as much of it as is complete.
    def feed(data)
      @rawdata << data
      goahead(false)
    end

    # Signal end of input: whatever is still buffered is flushed as data.
    def close
      goahead(true)
    end

    # Main tokenizer loop. Consumes @rawdata from the front, dispatching to
    # the parse_* / handle_* methods, and keeps any unparsed tail in @rawdata.
    # _end:: true when no more input will arrive, so an incomplete trailing
    #        construct is emitted as plain data instead of being kept.
    def goahead(_end)
      rawdata = @rawdata
      i = 0
      n = rawdata.length
      while i < n
        if @nomoretags
          handle_data(rawdata[i..(n-1)])
          i = n
          break
        end
        # Everything up to the next '&' or '<' is plain data.
        j = rawdata.index(Interesting, i)
        j = n unless j
        if i < j
          handle_data(rawdata[i..(j-1)])
        end
        i = j
        break if (i == n)
        if rawdata[i] == ?< #
          if rawdata.index(Starttagopen, i) == i
            if @literal
              handle_data(rawdata[i, 1])
              i += 1
              next
            end
            k = parse_starttag(i)
            break unless k # tag not complete yet: wait for more input
            i = k
            next
          end
          if rawdata.index(Endtagopen, i) == i
            k = parse_endtag(i)
            break unless k
            i = k
            @literal = false
            next
          end
          if rawdata.index(Commentopen, i) == i
            if @literal
              handle_data(rawdata[i,1])
              i += 1
              next
            end
            k = parse_comment(i)
            break unless k
            i += k # parse_comment returns a length, not a position
            next
          end
          if rawdata.index(Special, i) == i
            if @literal
              handle_data(rawdata[i, 1])
              i += 1
              next
            end
            k = parse_special(i)
            break unless k
            i += k # parse_special returns a length, not a position
            next
          end
        elsif rawdata[i] == ?& #
          if rawdata.index(Charref, i) == i
            i += $&.length
            handle_charref($1)
            i -= 1 unless rawdata[i-1] == ?;
            next
          end
          if rawdata.index(Entityref, i) == i
            i += $&.length
            handle_entityref($1)
            i -= 1 unless rawdata[i-1] == ?;
            next
          end
        else
          raise RuntimeError, 'neither < nor & ??'
        end
        # We get here only if incomplete matches but
        # nothing else
        match = rawdata.index(Incomplete, i)
        unless match == i
          handle_data(rawdata[i, 1])
          i += 1
          next
        end
        j = match + $&.length
        break if j == n # Really incomplete
        handle_data(rawdata[i..(j-1)])
        i = j
      end
      # end while
      if _end and i < n
        handle_data(@rawdata[i..(n-1)])
        i = n
      end
      @rawdata = rawdata[i..-1]
    end

    # Parse the comment starting at offset +i+ and pass its text to
    # #handle_comment. Returns the number of characters consumed, or nil
    # when the closing '-->' has not been received yet.
    def parse_comment(i)
      rawdata = @rawdata
      if rawdata[i, 4] != '<!--'
        raise RuntimeError, 'unexpected call to parse_comment'
      end
      match = rawdata.index(Commentclose, i)
      return nil unless match
      matched_length = $&.length
      j = match
      handle_comment(rawdata[i+4..(j-1)])
      j = match + matched_length
      return j-i
    end

    # Parse the start tag at offset +i+, collect its attributes and hand
    # them to #finish_starttag. Returns the offset just past the tag, or nil
    # when no closing bracket has been received yet.
    def parse_starttag(i)
      rawdata = @rawdata
      j = rawdata.index(Endbracket, i + 1)
      return nil unless j
      attrs = []
      if rawdata[i+1] == ?> #
        # SGML shorthand: <> == <last open tag seen>
        k = j
        tag = @lasttag
      else
        match = rawdata.index(Tagfind, i + 1)
        unless match
          raise RuntimeError, 'unexpected call to parse_starttag'
        end
        k = i + 1 + ($&.length)
        tag = $&.downcase
        @lasttag = tag
      end
      while k < j
        # String#index is not anchored at k: it may skip junk such as the
        # '/' of '<br />' and match further on. Only accept an attribute that
        # begins inside this tag, otherwise text following the tag would be
        # reported as attributes.
        pos = rawdata.index(Attrfind, k)
        break unless pos && pos < j
        matched_length = $&.length
        attrname, rest, attrvalue = $1, $2, $3
        if not rest
          attrvalue = '' # was: = attrname
        else
          # One-character substrings compare the same way on every Ruby
          # version (String#[] with a single index used to return an Integer).
          first = attrvalue[0, 1]
          last = attrvalue[-1, 1]
          if (first == "'" && last == "'") or (first == '"' && last == '"')
            attrvalue = attrvalue[1..-2]
          end
        end
        attrs << [attrname.downcase, attrvalue]
        # Resume after the match itself, including any skipped characters.
        k = pos + matched_length
      end
      if rawdata[j] == ?> #
        j += 1
      end
      finish_starttag(tag, attrs)
      return j
    end

    # Parse the end tag at offset +i+ and hand its lowercased name to
    # #finish_endtag. Returns the offset just past the tag, or nil when no
    # closing bracket has been received yet.
    def parse_endtag(i)
      rawdata = @rawdata
      j = rawdata.index(Endbracket, i + 1)
      return nil unless j
      tag = (rawdata[i+2..j-1].strip).downcase
      if rawdata[j] == ?> #
        j += 1
      end
      finish_endtag(tag)
      return j
    end

    # Dispatch a start tag. Returns 1 when a start_TAG method handled it
    # (the tag is pushed on the stack), 0 when a do_TAG method handled it,
    # and -1 when it was passed to #unknown_starttag.
    def finish_starttag(tag, attrs)
      method = 'start_' + tag
      if self.respond_to?(method)
        @stack << tag
        handle_starttag(tag, method, attrs)
        return 1
      else
        method = 'do_' + tag
        if self.respond_to?(method)
          handle_starttag(tag, method, attrs)
          return 0
        else
          unknown_starttag(tag, attrs)
          return -1
        end
      end
    end

    # Dispatch an end tag, closing every element opened since the matching
    # start tag. An empty +tag+ ("</>") closes the most recent open element.
    def finish_endtag(tag)
      if tag == ''
        found = @stack.length - 1
        if found < 0
          unknown_endtag(tag)
          return
        end
      else
        unless @stack.include? tag
          method = 'end_' + tag
          unless self.respond_to?(method)
            unknown_endtag(tag)
          end
          return
        end
        # Close the innermost open element with this name; using the
        # outermost one would also pop the enclosing elements when tags of
        # the same name are nested.
        found = @stack.rindex(tag)
      end
      while @stack.length > found
        tag = @stack[-1]
        method = 'end_' + tag
        if respond_to?(method)
          handle_endtag(tag, method)
        else
          unknown_endtag(tag)
        end
        @stack.pop
      end
    end

    # Parse the <!...> declaration at offset +i+ and pass its content to
    # #handle_special. Returns the number of characters consumed, or nil
    # when the closing bracket has not been received yet.
    def parse_special(i)
      rawdata = @rawdata
      match = rawdata.index(Endbracket, i+1)
      return nil unless match
      matched_length = $&.length
      handle_special(rawdata[i+1..(match-1)])
      return match - i + matched_length
    end

    # Invoke the start_TAG / do_TAG method named +method+ with +attrs+.
    def handle_starttag(tag, method, attrs)
      self.send(method, attrs)
    end

    # Invoke the end_TAG method named +method+.
    def handle_endtag(tag, method)
      self.send(method)
    end

    # Print a diagnostic about an unbalanced end tag (verbose mode only).
    def report_unbalanced(tag)
      if @verbose
        print '*** Unbalanced </' + tag + '>', "\n"
        # There is no #stack reader on this class; read the ivar directly.
        print '*** Stack:', @stack.inspect, "\n"
      end
    end

    # Handle a numeric character reference (the digits of "&#NNN;").
    # Values 0..255 are converted with Integer#chr and sent to #handle_data;
    # anything else goes to #unknown_charref.
    def handle_charref(name)
      n = name.to_i
      if !(0 <= n && n <= 255)
        unknown_charref(name)
        return
      end
      handle_data(n.chr)
    end

    # Handle a named entity reference: names listed in Entitydefs are sent
    # to #handle_data as their replacement text, others to #unknown_entityref.
    def handle_entityref(name)
      table = Entitydefs
      if table.include?(name)
        handle_data(table[name])
      else
        unknown_entityref(name)
        return
      end
    end

    # Hook: character data. Does nothing by default.
    def handle_data(data)
    end

    # Hook: comment text (without the delimiters). Does nothing by default.
    def handle_comment(data)
    end

    # Hook: content of a <!...> declaration. Does nothing by default.
    def handle_special(data)
    end

    # Hook: start tag with no start_TAG / do_TAG method. No-op by default.
    def unknown_starttag(tag, attrs)
    end

    # Hook: end tag with no matching handler. No-op by default.
    def unknown_endtag(tag)
    end

    # Hook: numeric character reference outside 0..255. No-op by default.
    def unknown_charref(ref)
    end

    # Hook: entity name missing from Entitydefs. No-op by default.
    def unknown_entityref(ref)
    end
  end
end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
require 'feedparser'
|
2
|
+
require 'feedparser/html2text-parser'
|
3
|
+
require 'feedparser/filesizes'
|
4
|
+
|
5
|
+
class String
  # Render this HTML string as plain text.
  #
  # The markup is run through FeedParser::HTML2TextParser, then the result
  # is tidied up. When +wrapto+ is truthy the text is additionally wrapped
  # to that many columns with #wrap_text.
  def html2text(wrapto = false)
    # parse HTML
    parser = FeedParser::HTML2TextParser::new(true)
    parser.feed(self.clone)
    parser.close

    # Clean-up passes, applied in this order.
    cleanups = [
      [/\A\s*/m, ''],         # leading whitespace
      [/\s*\Z/m, ''],         # trailing whitespace
      [/ *\n/m, "\n"],        # spaces before a newline
      [/\n */m, "\n"],        # spaces after a newline
      [/\n\n+/m, "\n\n"],     # runs of blank lines
      [/[ \t]+/, ' ']         # runs of spaces and tabs
    ]
    plain = cleanups.inject(parser.savedata) do |acc, (pattern, replacement)|
      acc.gsub(pattern, replacement)
    end

    # finally, wrap the text if requested
    wrapto ? wrap_text(plain, wrapto) : plain
  end

  # Return +text+ (not the receiver) with a newline inserted after each run
  # of at most +wrapto+ characters that is followed by spaces or a line end.
  def wrap_text(text, wrapto = 72)
    line_pattern = /(.{1,#{wrapto}})( +|$)\n?/
    text.gsub(line_pattern) { "#{$1}#{$2}\n" }
  end
end
|
34
|
+
|
35
|
+
module FeedParser
|
36
|
+
class Feed
  # Render the feed and all of its items as plain text.
  #
  # localtime:: forwarded to each item's to_text.
  # wrapto::    forwarded to each item's to_text; the feed description
  #             itself is converted with the default (unwrapped) html2text.
  #
  # Returns a String: a header block (type, encoding, title, link,
  # description, creator), a blank line, then every item preceded by a row
  # of 40 asterisks.
  def to_text(localtime = true, wrapto = false)
    # Append in place: `s += ...` would copy the whole accumulated text for
    # every line and every item.
    s = ''.dup
    s << "Type: #{@type}\n"
    s << "Encoding: #{@encoding}\n"
    s << "Title: #{@title}\n"
    s << "Link: #{@link}\n"
    if @description
      s << "Description: #{@description.html2text}\n"
    else
      s << "Description:\n"
    end
    s << "Creator: #{@creator}\n"
    s << "\n"
    @items.each do |i|
      s << ('*' * 40) << "\n"
      s << i.to_text(localtime, wrapto)
    end
    s
  end
end
|
57
|
+
|
58
|
+
class FeedItem
  # Render the item as plain text.
  #
  # localtime:: when true the date is printed as stored, otherwise it is
  #             converted with getutc first.
  # wrapto::    passed to String#html2text for the item content.
  # header::    when true the title/link/date summary comes first;
  #             when false the text starts with the link, and the summary
  #             is placed after a "-- " separator and the feed details.
  #
  # Returns a String ending with a newline.
  def to_text(localtime = true, wrapto = false, header = true)
    s = ''.dup
    if header
      s << 'Item: ' << text_item_summary(localtime) << "\n"
    elsif @link
      s << "<#{@link}>\n\n"
    end
    s << "#{@content.html2text(wrapto).chomp}\n" if @content
    if @enclosures and @enclosures.length > 0
      s << "\nFiles:"
      @enclosures.each do |e|
        # e is indexed as [0] location, [1] size, [2] type label.
        s << "\n #{e[0]} (#{e[1].to_i.to_human_readable}, #{e[2]})"
      end
    end
    s << '-- ' unless header
    s << "\nFeed: "
    # NOTE(review): assumes an item can exist without a parent feed; the
    # guard just skips the feed details instead of raising on nil.
    if @feed
      s << @feed.title if @feed.title
      s << "\n<#{@feed.link}>" if @feed.link
    end
    s << "\nItem: " << text_item_summary(localtime) unless header
    s << "\nAuthor: #{creator}" if creator
    s << "\nSubject: #{@subject}" if @subject
    if @categories && !@categories.empty?
      s << "\nFiled under: #{@categories.join(', ')}"
    end
    s << "\n" # final newline, for compat with history
    s
  end

  # Title, "<link>" and "Date: ..." lines shared by both layouts of
  # #to_text. Missing parts are simply left out.
  def text_item_summary(localtime)
    out = ''.dup
    out << @title if @title
    out << "\n<#{@link}>" if @link
    if @date
      stamp = localtime ? @date : @date.getutc
      out << "\nDate: #{stamp.to_s}"
    end
    out
  end
  private :text_item_summary
end
|
108
|
+
end
|