ruby-feedparser 0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +340 -0
- data/ChangeLog +59 -0
- data/LICENSE +60 -0
- data/README +14 -0
- data/Rakefile +84 -0
- data/lib/feedparser.rb +28 -0
- data/lib/feedparser/feedparser.rb +343 -0
- data/lib/feedparser/filesizes.rb +14 -0
- data/lib/feedparser/html-output.rb +126 -0
- data/lib/feedparser/html2text-parser.rb +413 -0
- data/lib/feedparser/rexml_patch.rb +28 -0
- data/lib/feedparser/sgml-parser.rb +332 -0
- data/lib/feedparser/text-output.rb +108 -0
- data/lib/feedparser/textconverters.rb +120 -0
- data/setup.rb +1586 -0
- data/test/tc_feed_parse.rb +117 -0
- data/test/tc_htmloutput.rb +52 -0
- data/test/tc_parser.rb +48 -0
- data/test/tc_textoutput.rb +48 -0
- data/test/tc_textwrappedoutput.rb +48 -0
- data/test/ts_feedparser.rb +12 -0
- data/tools/doctoweb.bash +30 -0
- metadata +76 -0
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'feedparser/textconverters'
|
2
|
+
|
3
|
+
# Patch for REXML
|
4
|
+
# Very ugly patch to make REXML error-proof.
|
5
|
+
# The problem is REXML uses IConv, which isn't error-proof at all.
|
6
|
+
# With those changes, it uses unpack/pack with some error handling
|
7
|
+
module REXML
  # Replacement encoding hooks for REXML: decoding is delegated to
  # String#toUTF8 and encoding is a pass-through.
  module Encoding
    # Returns +str+ converted through String#toUTF8, using the currently
    # selected encoding name.
    # NOTE(review): toUTF8 is presumably supplied by
    # 'feedparser/textconverters' (required at the top of this file) -- confirm.
    def decode(str)
      str.toUTF8(@encoding)
    end

    # Returns +str+ untouched.
    def encode(str)
      str
    end

    # Remembers the encoding name. Does nothing when the same value is
    # already stored; a nil/false +enc+ is stored as 'utf-8'.
    def encoding=(enc)
      unchanged = defined?(@encoding) && enc == @encoding
      @encoding = enc || 'utf-8' unless unchanged
    end
  end

  class Element
    # Direct reader for the @children instance variable.
    attr_reader :children
  end
end
|
@@ -0,0 +1,332 @@
|
|
1
|
+
# A parser for SGML, using the derived class as static DTD.
|
2
|
+
# from http://raa.ruby-lang.org/project/html-parser
|
3
|
+
module FeedParser
|
4
|
+
class SGMLParser
  # Event-driven SGML/HTML tokenizer. Subclasses act as a "static DTD":
  #
  # * start_TAG(attrs) / end_TAG  -- element with an end tag; the tag is
  #   pushed on the open-element stack while it is open.
  # * do_TAG(attrs)               -- element handled without touching the
  #   stack.
  # * handle_data, handle_comment, handle_special, unknown_* -- hooks that
  #   are no-ops here and are meant to be overridden.
  #
  # Input is pushed with #feed (possibly in several chunks) and flushed
  # with #close.

  # Regular expressions used for parsing:
  Interesting = /[&<]/
  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                              '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                              '![^<>]*)?')

  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*);/
  Charref = /&#([0-9]+);/

  Starttagopen = /<[>a-zA-Z]/
  Endtagopen = /<\/[<>a-zA-Z]/
  Endbracket = /[<>]/
  Special = /<![^<>]*>/
  Commentopen = /<!--/
  Commentclose = /--[ \t\n]*>/
  Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
  Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                            '(\s*=\s*' +
                            "('[^']*'" +
                            '|"[^"]*"' +
                            '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')

  # Named entities understood by #handle_entityref.
  Entitydefs =
    {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

  # verbose:: when true, #report_unbalanced prints diagnostics.
  def initialize(verbose=false)
    @verbose = verbose
    reset
  end

  # Drops any buffered input and returns the parser to its initial state.
  def reset
    @rawdata = ''
    @stack = []
    @lasttag = '???'
    @nomoretags = false
    @literal = false
  end

  # True when +gi+ is currently on the open-element stack.
  def has_context(gi)
    @stack.include? gi
  end

  # From now on all input is passed to handle_data verbatim.
  def setnomoretags
    @nomoretags = true
    @literal = true
  end

  # Switches to literal mode: start tags, comments and specials are
  # reported as data until an end tag is parsed (see #goahead).
  def setliteral(*args)
    @literal = true
  end

  # Appends +data+ to the buffer and parses as much of it as is complete.
  def feed(data)
    @rawdata << data
    goahead(false)
  end

  # Parses whatever is still buffered, treating it as the end of input.
  def close
    goahead(true)
  end

  # Main scanning loop. Consumes @rawdata from the left, dispatching to the
  # parse_* / handle_* methods. Stops early on a construct that is not yet
  # complete; the unparsed tail stays in @rawdata, unless +_end+ is true,
  # in which case the tail is emitted as plain data.
  def goahead(_end)
    rawdata = @rawdata
    i = 0
    n = rawdata.length
    while i < n
      if @nomoretags
        handle_data(rawdata[i..(n-1)])
        i = n
        break
      end
      # Everything up to the next '&' or '<' is plain data.
      j = rawdata.index(Interesting, i) || n
      if i < j
        handle_data(rawdata[i..(j-1)])
      end
      i = j
      break if (i == n)
      if rawdata[i] == ?<
        if rawdata.index(Starttagopen, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_starttag(i)   # absolute index after the tag, or nil
          break unless k
          i = k
          next
        end
        if rawdata.index(Endtagopen, i) == i
          k = parse_endtag(i)     # absolute index after the tag, or nil
          break unless k
          i = k
          @literal = false
          next
        end
        if rawdata.index(Commentopen, i) == i
          if @literal
            handle_data(rawdata[i,1])
            i += 1
            next
          end
          k = parse_comment(i)    # length of the comment, or nil
          break unless k
          i += k
          next
        end
        if rawdata.index(Special, i) == i
          if @literal
            handle_data(rawdata[i, 1])
            i += 1
            next
          end
          k = parse_special(i)    # length of the declaration, or nil
          break unless k
          i += k
          next
        end
      elsif rawdata[i] == ?&
        if rawdata.index(Charref, i) == i
          i += $&.length
          handle_charref($1)
          i -= 1 unless rawdata[i-1] == ?;
          next
        end
        if rawdata.index(Entityref, i) == i
          i += $&.length
          handle_entityref($1)
          i -= 1 unless rawdata[i-1] == ?;
          next
        end
      else
        raise RuntimeError, 'neither < nor & ??'
      end
      # We get here only if incomplete matches but
      # nothing else
      match = rawdata.index(Incomplete, i)
      unless match == i
        handle_data(rawdata[i, 1])
        i += 1
        next
      end
      j = match + $&.length
      break if j == n # Really incomplete
      handle_data(rawdata[i..(j-1)])
      i = j
    end
    # end while
    if _end and i < n
      handle_data(@rawdata[i..(n-1)])
      i = n
    end
    @rawdata = rawdata[i..-1]
  end

  # Parses the comment starting at index +i+ of the buffer.
  # Returns the number of characters consumed, or nil when the closing
  # delimiter has not arrived yet.
  # Raises RuntimeError when +i+ does not point at '<!--'.
  def parse_comment(i)
    rawdata = @rawdata
    if rawdata[i, 4] != '<!--'
      raise RuntimeError, 'unexpected call to handle_comment'
    end
    match = rawdata.index(Commentclose, i)
    return nil unless match
    matched_length = $&.length
    handle_comment(rawdata[i+4..(match-1)])
    return match + matched_length - i
  end

  # Parses the start tag at index +i+, collecting [name, value] attribute
  # pairs (names lower-cased, surrounding quotes removed, valueless
  # attributes get '').
  # Returns the absolute index just past the tag, or nil when no closing
  # bracket is buffered yet.
  def parse_starttag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j
    attrs = []
    if rawdata[i+1] == ?>
      # SGML shorthand: <> == <last open tag seen>
      k = j
      tag = @lasttag
    else
      match = rawdata.index(Tagfind, i + 1)
      unless match
        raise RuntimeError, 'unexpected call to parse_starttag'
      end
      k = i + 1 + ($&.length)
      tag = $&.downcase
      @lasttag = tag
    end
    while k < j
      match = rawdata.index(Attrfind, k)
      # The search is not anchored: ignore a hit that starts at or after
      # the closing bracket, otherwise text following the tag (e.g. the
      # "hello" in "<br />hello") would be recorded as an attribute.
      break unless match && match < j
      matched_length = $&.length
      attrname, rest, attrvalue = $1, $2, $3
      if not rest
        attrvalue = '' # was: = attrname
      elsif (attrvalue[0, 1] == "'" && attrvalue[-1, 1] == "'") ||
            (attrvalue[0, 1] == '"' && attrvalue[-1, 1] == '"')
        attrvalue = attrvalue[1..-2]
      end
      attrs << [attrname.downcase, attrvalue]
      # Resume after the end of the match itself; the match may start
      # later than k when unparsable characters had to be skipped.
      k = match + matched_length
    end
    if rawdata[j] == ?>
      j += 1
    end
    finish_starttag(tag, attrs)
    return j
  end

  # Parses the end tag at index +i+. Returns the absolute index just past
  # the tag, or nil when no closing bracket is buffered yet.
  def parse_endtag(i)
    rawdata = @rawdata
    j = rawdata.index(Endbracket, i + 1)
    return nil unless j
    tag = (rawdata[i+2..j-1].strip).downcase
    if rawdata[j] == ?>
      j += 1
    end
    finish_endtag(tag)
    return j
  end

  # Dispatches a start tag. Returns 1 when a start_TAG handler exists (the
  # tag is pushed on the stack), 0 when only a do_TAG handler exists, and
  # -1 after falling back to unknown_starttag.
  def finish_starttag(tag, attrs)
    method = 'start_' + tag
    if self.respond_to?(method)
      @stack << tag
      handle_starttag(tag, method, attrs)
      return 1
    else
      method = 'do_' + tag
      if self.respond_to?(method)
        handle_starttag(tag, method, attrs)
        return 0
      else
        unknown_starttag(tag, attrs)
        return -1
      end
    end
  end

  # Dispatches an end tag. An empty +tag+ ("</>") closes the most recently
  # opened element. Otherwise every element opened after the innermost
  # open +tag+ is closed first, then +tag+ itself.
  def finish_endtag(tag)
    if tag == ''
      found = @stack.length - 1
      if found < 0
        unknown_endtag(tag)
        return
      end
    else
      unless @stack.include? tag
        method = 'end_' + tag
        unless self.respond_to?(method)
          unknown_endtag(tag)
        end
        return
      end
      # Innermost occurrence: with nested elements of the same name a
      # single end tag must close only the most recently opened one.
      found = @stack.rindex(tag)
    end
    while @stack.length > found
      tag = @stack[-1]
      method = 'end_' + tag
      if respond_to?(method)
        handle_endtag(tag, method)
      else
        unknown_endtag(tag)
      end
      @stack.pop
    end
  end

  # Parses the "<!...>" declaration at index +i+. Returns the number of
  # characters consumed, or nil when it is not complete yet.
  def parse_special(i)
    rawdata = @rawdata
    match = rawdata.index(Endbracket, i+1)
    return nil unless match
    matched_length = $&.length
    handle_special(rawdata[i+1..(match-1)])
    return match - i + matched_length
  end

  # Invokes the start_TAG / do_TAG handler named +method+ with +attrs+.
  def handle_starttag(tag, method, attrs)
    self.send(method, attrs)
  end

  # Invokes the end_TAG handler named +method+.
  def handle_endtag(tag, method)
    self.send(method)
  end

  # Prints a diagnostic about an unbalanced end tag when verbose.
  def report_unbalanced(tag)
    if @verbose
      print '*** Unbalanced </' + tag + '>', "\n"
      # This class defines no `stack` reader, so read the ivar directly.
      print '*** Stack:', @stack.inspect, "\n"
    end
  end

  # Handles "&#NNN;". Values 0..255 are emitted as data via Integer#chr;
  # anything else is passed to unknown_charref.
  def handle_charref(name)
    n = name.to_i
    if !(0 <= n && n <= 255)
      unknown_charref(name)
      return
    end
    handle_data(n.chr)
  end

  # Handles "&name;". Names listed in Entitydefs are emitted as data;
  # anything else is passed to unknown_entityref.
  def handle_entityref(name)
    table = Entitydefs
    if table.include?(name)
      handle_data(table[name])
    else
      unknown_entityref(name)
      return
    end
  end

  # Hooks below do nothing here; override them in a subclass.

  def handle_data(data)
  end

  def handle_comment(data)
  end

  def handle_special(data)
  end

  def unknown_starttag(tag, attrs)
  end
  def unknown_endtag(tag)
  end
  def unknown_charref(ref)
  end
  def unknown_entityref(ref)
  end
end
|
332
|
+
end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
require 'feedparser'
|
2
|
+
require 'feedparser/html2text-parser'
|
3
|
+
require 'feedparser/filesizes'
|
4
|
+
|
5
|
+
class String
  # Convert an HTML text to plain text.
  #
  # The receiver is run through FeedParser::HTML2TextParser, the resulting
  # text has its whitespace normalised, and it is wrapped with #wrap_text
  # when +wrapto+ is truthy (it is then used as the wrap width).
  def html2text(wrapto = false)
    parser = FeedParser::HTML2TextParser.new(true)
    parser.feed(clone)
    parser.close
    plain = parser.savedata

    # Applied in order, in place:
    cleanups = [
      [/\A\s*/m, ''],        # leading whitespace
      [/\s*\Z/m, ''],        # trailing whitespace
      [/ *\n/m, "\n"],       # spaces before a newline
      [/\n */m, "\n"],       # spaces after a newline
      [/\n\n+/m, "\n\n"],    # runs of blank lines
      [/[ \t]+/, ' ']        # runs of spaces/tabs
    ]
    cleanups.each { |pattern, replacement| plain.gsub!(pattern, replacement) }

    wrapto ? wrap_text(plain, wrapto) : plain
  end

  # Returns a copy of +text+ re-broken into lines of at most +wrapto+
  # characters; breaks happen at runs of spaces or at line ends, and every
  # emitted line ends with "\n".
  def wrap_text(text, wrapto = 72)
    line_pattern = Regexp.new("(.{1,#{wrapto}})( +|$)\\n?")
    text.gsub(line_pattern) { "#{$1}#{$2}\n" }
  end
end
|
34
|
+
|
35
|
+
module FeedParser
|
36
|
+
class Feed
  # Renders the feed as plain text: a block of "Name: value" header lines,
  # a blank line, then every item (via its own to_text) preceded by a row
  # of 40 asterisks. +localtime+ and +wrapto+ are handed to each item.
  def to_text(localtime = true, wrapto = false)
    description = @description ? "Description: #{@description.html2text}\n" : "Description:\n"
    header = [
      "Type: #{@type}\n",
      "Encoding: #{@encoding}\n",
      "Title: #{@title}\n",
      "Link: #{@link}\n",
      description,
      "Creator: #{@creator}\n",
      "\n"
    ]
    out = header.join
    separator = ('*' * 40) + "\n"
    @items.each do |item|
      out << separator
      out << item.to_text(localtime, wrapto)
    end
    out
  end
end
|
57
|
+
|
58
|
+
class FeedItem
  # Renders the item as plain text.
  #
  # localtime:: when false the date is converted with getutc before printing.
  # wrapto::    passed to String#html2text for the item content.
  # header::    when true the title/link/date summary comes first; when
  #             false the text starts with the link and the summary is
  #             printed after a "-- " marker and the feed information.
  def to_text(localtime = true, wrapto = false, header = true)
    # Title, link and date of the item; used at the top or at the bottom
    # depending on +header+.
    summary = lambda do
      lines = ''
      lines << @title if @title
      lines << "\n<#{@link}>" if @link
      if @date
        shown = localtime ? @date : @date.getutc
        lines << "\nDate: #{shown.to_s}"
      end
      lines
    end

    out = ''
    if header
      out << 'Item: ' << summary.call << "\n"
    elsif @link
      out << "<#{@link}>\n\n"
    end

    out << "#{@content.html2text(wrapto).chomp}\n" if @content

    if @enclosures && @enclosures.length > 0
      out << "\nFiles:"
      @enclosures.each do |enc|
        out << "\n #{enc[0]} (#{enc[1].to_i.to_human_readable}, #{enc[2]})"
      end
    end

    out << '-- ' unless header
    out << "\nFeed: "
    out << @feed.title if @feed.title
    out << "\n<#{@feed.link}>" if @feed.link
    out << "\nItem: " << summary.call unless header

    out << "\nAuthor: #{creator}" if creator
    out << "\nSubject: #{@subject}" if @subject
    out << "\nFiled under: #{@categories.join(', ')}" unless @categories.empty?
    out << "\n" # final newline, for compat with history
    out
  end
end
|
108
|
+
end
|