penso-feedparser 0.8

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,28 @@
1
+ require 'feedparser/textconverters'
2
+
3
+ # Patch for REXML
4
+ # Very ugly patch to make REXML error-proof.
5
+ # The problem is REXML uses IConv, which isn't error-proof at all.
6
+ # With those changes, it uses unpack/pack with some error handling
7
module REXML
  # Replacement for REXML's encoding helpers: decoding is delegated to
  # String#toUTF8 and encoding is a pass-through.
  module Encoding
    # Convert +str+ from the currently selected encoding to UTF-8.
    def decode(str)
      str.toUTF8(@encoding)
    end

    # Strings are handed back unchanged.
    def encode(str)
      str
    end

    # Remember the encoding to decode from; +nil+ selects 'utf-8'.
    # Nothing is done when the same encoding is already set.
    def encoding=(enc)
      already_set = defined?(@encoding) && enc == @encoding
      @encoding = enc || 'utf-8' unless already_set
    end
  end

  class Element
    # Direct access to the raw child list.
    attr_reader :children
  end
end
@@ -0,0 +1,332 @@
1
+ # A parser for SGML, using the derived class as static DTD.
2
+ # from http://raa.ruby-lang.org/project/html-parser
3
module FeedParser
  # Forgiving, event-driven tokenizer for SGML/HTML text.
  #
  # Text is pushed in with #feed (any number of times) and flushed with
  # #close. While scanning, the parser invokes callbacks that subclasses
  # override:
  #
  # * handle_data, handle_comment, handle_special
  # * start_TAG(attrs) / end_TAG for elements that are tracked on the stack
  # * do_TAG(attrs) for elements that are not tracked on the stack
  # * unknown_starttag, unknown_endtag, unknown_charref, unknown_entityref
  #
  # All default callbacks are no-ops.
  class SGMLParser
    # Regular expressions used for parsing:
    Interesting = /[&<]/
    Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                                '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                                '![^<>]*)?')

    Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*);/
    Charref = /&#([0-9]+);/

    Starttagopen = /<[>a-zA-Z]/
    Endtagopen = /<\/[<>a-zA-Z]/
    Endbracket = /[<>]/
    Special = /<![^<>]*>/
    Commentopen = /<!--/
    Commentclose = /--[ \t\n]*>/
    Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
    Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                              '(\s*=\s*' +
                              "('[^']*'" +
                              '|"[^"]*"' +
                              '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')

    # Entities that handle_entityref resolves itself.
    Entitydefs =
      {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

    # verbose:: when true, report_unbalanced prints diagnostics.
    def initialize(verbose=false)
      @verbose = verbose
      reset
    end

    # Drop any buffered input and return to the initial parsing state.
    def reset
      @rawdata = ''
      @stack = []
      @lasttag = '???'
      @nomoretags = false
      @literal = false
    end

    # True when an element named +gi+ is currently open on the stack.
    def has_context(gi)
      @stack.include? gi
    end

    # From now on, pass all remaining input to handle_data untouched.
    def setnomoretags
      @nomoretags = true
      @literal = true
    end

    # Treat start tags, comments and declarations as plain data until the
    # next end tag has been parsed. Arguments are accepted and ignored.
    def setliteral(*args)
      @literal = true
    end

    # Append +data+ to the buffer and parse as much of it as is complete.
    def feed(data)
      @rawdata << data
      goahead(false)
    end

    # Parse whatever is left in the buffer, emitting incomplete markup as
    # plain data.
    def close
      goahead(true)
    end

    # Main scanning loop. Consumes @rawdata from the front and leaves the
    # unparsed tail (incomplete markup) in @rawdata, unless +_end+ is true,
    # in which case the tail is emitted through handle_data.
    def goahead(_end)
      rawdata = @rawdata
      i = 0
      n = rawdata.length
      while i < n
        if @nomoretags
          handle_data(rawdata[i..(n-1)])
          i = n
          break
        end
        # Everything up to the next '&' or '<' is plain data.
        j = rawdata.index(Interesting, i) || n
        handle_data(rawdata[i..(j-1)]) if i < j
        i = j
        break if i == n
        if rawdata[i] == ?< #
          if rawdata.index(Starttagopen, i) == i
            if @literal
              handle_data(rawdata[i, 1])
              i += 1
              next
            end
            k = parse_starttag(i)
            break unless k
            i = k
            next
          end
          if rawdata.index(Endtagopen, i) == i
            k = parse_endtag(i)
            break unless k
            i = k
            @literal = false
            next
          end
          if rawdata.index(Commentopen, i) == i
            if @literal
              handle_data(rawdata[i, 1])
              i += 1
              next
            end
            # parse_comment returns a length, not an absolute position
            k = parse_comment(i)
            break unless k
            i += k
            next
          end
          if rawdata.index(Special, i) == i
            if @literal
              handle_data(rawdata[i, 1])
              i += 1
              next
            end
            # parse_special returns a length, not an absolute position
            k = parse_special(i)
            break unless k
            i += k
            next
          end
        elsif rawdata[i] == ?& #
          if rawdata.index(Charref, i) == i
            i += $&.length
            handle_charref($1)
            i -= 1 unless rawdata[i-1] == ?;
            next
          end
          if rawdata.index(Entityref, i) == i
            i += $&.length
            handle_entityref($1)
            i -= 1 unless rawdata[i-1] == ?;
            next
          end
        else
          raise RuntimeError, 'neither < nor & ??'
        end
        # We get here only if incomplete matches but
        # nothing else
        match = rawdata.index(Incomplete, i)
        unless match == i
          handle_data(rawdata[i, 1])
          i += 1
          next
        end
        j = match + $&.length
        break if j == n # Really incomplete
        handle_data(rawdata[i..(j-1)])
        i = j
      end
      # end while
      if _end and i < n
        handle_data(@rawdata[i..(n-1)])
        i = n
      end
      @rawdata = rawdata[i..-1]
    end

    # Parse the comment starting at offset +i+ and pass its text to
    # handle_comment. Returns the number of characters consumed, or nil
    # when the closing "-->" has not arrived yet.
    def parse_comment(i)
      rawdata = @rawdata
      if rawdata[i, 4] != '<!--'
        raise RuntimeError, 'unexpected call to handle_comment'
      end
      match = rawdata.index(Commentclose, i)
      return nil unless match
      matched_length = $&.length
      handle_comment(rawdata[i+4..(match-1)])
      return match + matched_length - i
    end

    # Parse the start tag at offset +i+, collect its attributes as
    # [name, value] pairs and dispatch through finish_starttag. Returns the
    # offset just past the tag, or nil when the tag is not yet complete.
    def parse_starttag(i)
      rawdata = @rawdata
      j = rawdata.index(Endbracket, i + 1)
      return nil unless j
      attrs = []
      if rawdata[i+1] == ?> #
        # SGML shorthand: <> == <last open tag seen>
        k = j
        tag = @lasttag
      else
        match = rawdata.index(Tagfind, i + 1)
        unless match
          raise RuntimeError, 'unexpected call to parse_starttag'
        end
        k = i + 1 + ($&.length)
        tag = $&.downcase
        @lasttag = tag
      end
      while k < j
        match = rawdata.index(Attrfind, k)
        # The search is not anchored at k, so only accept an attribute
        # that starts inside this tag; anything at or beyond the closing
        # bracket belongs to the following text.
        break unless match && match < j
        matched_length = $&.length
        attrname, rest, attrvalue = $1, $2, $3
        if not rest
          attrvalue = '' # was: = attrname
        elsif (attrvalue[0, 1] == "'" && attrvalue[-1, 1] == "'") or
              (attrvalue[0, 1] == '"' && attrvalue[-1, 1] == '"')
          attrvalue = attrvalue[1..-2]
        end
        attrs << [attrname.downcase, attrvalue]
        # Resume after the match itself (which may begin past k when junk
        # characters were skipped) so the same text is never scanned twice.
        k = match + matched_length
      end
      if rawdata[j] == ?> #
        j += 1
      end
      finish_starttag(tag, attrs)
      return j
    end

    # Parse the end tag at offset +i+ and dispatch through finish_endtag.
    # Returns the offset just past the tag, or nil when incomplete.
    def parse_endtag(i)
      rawdata = @rawdata
      j = rawdata.index(Endbracket, i + 1)
      return nil unless j
      tag = (rawdata[i+2..j-1].strip).downcase
      if rawdata[j] == ?> #
        j += 1
      end
      finish_endtag(tag)
      return j
    end

    # Dispatch a start tag. Returns 1 when a start_TAG handler exists (the
    # tag is pushed on the stack), 0 when only do_TAG exists, and -1 when
    # the tag was reported through unknown_starttag.
    def finish_starttag(tag, attrs)
      method = 'start_' + tag
      if self.respond_to?(method)
        @stack << tag
        handle_starttag(tag, method, attrs)
        return 1
      else
        method = 'do_' + tag
        if self.respond_to?(method)
          handle_starttag(tag, method, attrs)
          return 0
        else
          unknown_starttag(tag, attrs)
          return -1
        end
      end
    end

    # Dispatch an end tag. An empty +tag+ (the "</>" shorthand) closes the
    # most recently opened element. Otherwise every element opened after
    # the innermost open +tag+ is closed first, then +tag+ itself.
    def finish_endtag(tag)
      if tag == ''
        found = @stack.length - 1
        if found < 0
          unknown_endtag(tag)
          return
        end
      else
        unless @stack.include? tag
          method = 'end_' + tag
          unless self.respond_to?(method)
            unknown_endtag(tag)
          end
          return
        end
        # Innermost occurrence: closing a nested element must not close
        # an enclosing element that happens to share its name.
        found = @stack.rindex(tag)
      end
      while @stack.length > found
        tag = @stack[-1]
        method = 'end_' + tag
        if respond_to?(method)
          handle_endtag(tag, method)
        else
          unknown_endtag(tag)
        end
        @stack.pop
      end
    end

    # Parse the "<!...>" declaration at offset +i+ and pass its content to
    # handle_special. Returns the number of characters consumed, or nil
    # when incomplete.
    def parse_special(i)
      rawdata = @rawdata
      match = rawdata.index(Endbracket, i+1)
      return nil unless match
      matched_length = $&.length
      handle_special(rawdata[i+1..(match-1)])
      return match - i + matched_length
    end

    # Invoke the start/do handler named +method+ with the attribute list.
    def handle_starttag(tag, method, attrs)
      self.send(method, attrs)
    end

    # Invoke the end handler named +method+.
    def handle_endtag(tag, method)
      self.send(method)
    end

    # Print a diagnostic about an unbalanced end tag (verbose mode only).
    def report_unbalanced(tag)
      if @verbose
        print '*** Unbalanced </' + tag + '>', "\n"
        print '*** Stack:', @stack.inspect, "\n"
      end
    end

    # Resolve a numeric character reference. Values 0..255 are passed to
    # handle_data as a single character; anything else goes to
    # unknown_charref.
    def handle_charref(name)
      n = name.to_i
      if !(0 <= n && n <= 255)
        unknown_charref(name)
        return
      end
      handle_data(n.chr)
    end

    # Resolve a named entity through Entitydefs, or report it through
    # unknown_entityref.
    def handle_entityref(name)
      table = Entitydefs
      if table.include?(name)
        handle_data(table[name])
      else
        unknown_entityref(name)
        return
      end
    end

    # Callbacks: all no-ops here, meant to be overridden by subclasses.

    def handle_data(data)
    end

    def handle_comment(data)
    end

    def handle_special(data)
    end

    def unknown_starttag(tag, attrs)
    end

    def unknown_endtag(tag)
    end

    def unknown_charref(ref)
    end

    def unknown_entityref(ref)
    end
  end
end
1
+ require 'feedparser'
2
+ require 'feedparser/html2text-parser'
3
+ require 'feedparser/filesizes'
4
+
5
class String
  # Convert an HTML text to plain text.
  #
  # The receiver is run through FeedParser::HTML2TextParser, the resulting
  # text is whitespace-normalized and, when +wrapto+ is given, wrapped to
  # that many columns.
  def html2text(wrapto = false)
    # parse HTML
    parser = FeedParser::HTML2TextParser::new(true)
    parser.feed(clone)
    parser.close
    plain = parser.savedata

    # Applied in order:
    #   1-2. remove leading and trailing whitespace
    #   3-4. remove whitespace around \n
    #   5.   collapse runs of blank lines
    #   6.   collapse runs of spaces/tabs
    cleanups = [
      [/\A\s*/m, ''],
      [/\s*\Z/m, ''],
      [/ *\n/m, "\n"],
      [/\n */m, "\n"],
      [/\n\n+/m, "\n\n"],
      [/[ \t]+/, ' ']
    ]
    cleanups.each { |pattern, replacement| plain.gsub!(pattern, replacement) }

    # finally, wrap the text if requested
    wrapto ? wrap_text(plain, wrapto) : plain
  end

  # Break +text+ into lines of at most +wrapto+ characters, splitting at
  # runs of spaces or at line ends; every emitted line ends with "\n".
  def wrap_text(text, wrapto = 72)
    line_pattern = /(.{1,#{wrapto}})( +|$)\n?/
    text.gsub(line_pattern, "\\1\\2\n")
  end
end
34
+
35
+ module FeedParser
36
class Feed
  # Render the feed as plain text: a header block with the feed's
  # metadata followed by every item, each preceded by a line of 40
  # asterisks. +localtime+ and +wrapto+ are handed on to each item's
  # to_text.
  def to_text(localtime = true, wrapto = false)
    report = "Type: #{@type}\n" \
             "Encoding: #{@encoding}\n" \
             "Title: #{@title}\n" \
             "Link: #{@link}\n"
    report += @description ? "Description: #{@description.html2text}\n" : "Description:\n"
    report += "Creator: #{@creator}\n\n"

    separator = '*' * 40 + "\n"
    @items.inject(report) do |text, item|
      text + separator + item.to_text(localtime, wrapto)
    end
  end
end
57
+
58
class FeedItem
  # Render the item as plain text.
  #
  # localtime:: print the date as stored when true, converted to UTC
  #             otherwise
  # wrapto::    column to wrap the content at (false disables wrapping)
  # header::    when true the title/link/date block comes first; when
  #             false the text starts with the link, and the item block
  #             is placed after a "-- " marker and the feed information
  def to_text(localtime = true, wrapto = false, header = true)
    # Title, link and date lines; used by both layouts.
    item_block = lambda do |label|
      block = label
      block += @title if @title
      block += "\n<#{@link}>" if @link
      if @date
        stamp = localtime ? @date : @date.getutc
        block += "\nDate: #{stamp.to_s}"
      end
      block
    end

    out = if header
            item_block.call("Item: ") + "\n"
          elsif @link
            "<#{@link}>\n\n"
          else
            ""
          end

    out += "#{@content.html2text(wrapto).chomp}\n" if @content

    if @enclosures && @enclosures.length > 0
      out += "\nFiles:"
      @enclosures.each do |enclosure|
        out += "\n #{enclosure[0]} (#{enclosure[1].to_i.to_human_readable}, #{enclosure[2]})"
      end
    end

    out += "-- " unless header
    out += "\nFeed: "
    out += @feed.title if @feed.title
    out += "\n<#{@feed.link}>" if @feed.link
    out += item_block.call("\nItem: ") unless header

    out += "\nAuthor: #{creator}" if creator
    out += "\nSubject: #{@subject}" if @subject
    out += "\nFiled under: #{@categories.join(', ')}" unless @categories.empty?
    out + "\n" # final newline, for compat with history
  end
end
108
+ end