ruby-feedparser 0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ require 'feedparser/textconverters'
2
+
3
+ # Patch for REXML
4
+ # Very ugly patch to make REXML error-proof.
5
+ # The problem is REXML uses IConv, which isn't error-proof at all.
6
+ # With those changes, it uses unpack/pack with some error handling
7
module REXML
  # Replacement encoding hooks for REXML: decoding is delegated to
  # String#toUTF8 (loaded from feedparser/textconverters) and encoding is a
  # pass-through.
  module Encoding
    # Decode +str+ from the currently configured encoding to UTF-8.
    def decode(str)
      str.toUTF8(@encoding)
    end

    # Encoding is a no-op: +str+ is handed back untouched.
    def encode(str)
      str
    end

    # Remember the source encoding, defaulting to 'utf-8' when +enc+ is nil
    # (or false). Nothing is assigned when the same encoding is already set.
    def encoding=(enc)
      already_set = defined?(@encoding) && enc == @encoding
      @encoding = enc || 'utf-8' unless already_set
    end
  end

  class Element
    # Expose the @children instance variable itself.
    attr_reader :children
  end
end
@@ -0,0 +1,332 @@
1
+ # A parser for SGML, using the derived class as static DTD.
2
+ # from http://raa.ruby-lang.org/project/html-parser
3
module FeedParser
  # A small event-driven SGML/HTML tokenizer. The subclass acts as a static
  # DTD: for every tag +foo+ it may define
  #
  # * +start_foo(attrs)+ -- an element with content (pushed on the open stack),
  # * +end_foo+          -- called when that element is closed,
  # * +do_foo(attrs)+    -- an empty element (never pushed on the stack).
  #
  # Text, comments, declarations and unknown constructs are reported through
  # the +handle_*+ and +unknown_*+ hooks, which do nothing by default.
  #
  # Input is supplied incrementally with #feed; #close flushes whatever is
  # still buffered.
  class SGMLParser
    # Regular expressions used for parsing:
    Interesting = /[&<]/
    Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                                '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                                '![^<>]*)?')

    Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*);/
    Charref = /&#([0-9]+);/

    Starttagopen = /<[>a-zA-Z]/
    Endtagopen = /<\/[<>a-zA-Z]/
    Endbracket = /[<>]/
    Special = /<![^<>]*>/
    Commentopen = /<!--/
    Commentclose = /--[ \t\n]*>/
    Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
    Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                              '(\s*=\s*' +
                              "('[^']*'" +
                              '|"[^"]*"' +
                              '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')

    # Entities that #handle_entityref resolves itself.
    Entitydefs =
      {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

    # verbose:: when true, #report_unbalanced prints diagnostics.
    def initialize(verbose=false)
      @verbose = verbose
      reset
    end

    # Drop all buffered input and parser state.
    def reset
      # #feed appends to this buffer in place, so it must not be a (possibly
      # frozen) string literal.
      @rawdata = ''.dup
      @stack = []
      @lasttag = '???'
      @nomoretags = false
      @literal = false
    end

    # True when an element named +gi+ is currently open.
    def has_context(gi)
      @stack.include? gi
    end

    # From now on treat all remaining input as plain character data.
    def setnomoretags
      @nomoretags = true
      @literal = true
    end

    # Enter literal mode: start tags, comments and declarations are passed
    # through as data until the next end tag is parsed. Arguments are ignored.
    def setliteral(*args)
      @literal = true
    end

    # Append +data+ to the buffer and parse as much of it as is complete.
    def feed(data)
      @rawdata << data
      goahead(false)
    end

    # Signal end of input: everything still buffered is processed or flushed
    # as data.
    def close
      goahead(true)
    end

    # Main scanning loop. Consumes @rawdata from the front, dispatching to the
    # parse_*/handle_* methods, and keeps the unconsumed tail in @rawdata.
    # _end:: true when no more input will follow, so an incomplete trailing
    #        construct is emitted as data instead of being kept.
    def goahead(_end)
      rawdata = @rawdata
      i = 0
      n = rawdata.length
      while i < n
        if @nomoretags
          handle_data(rawdata[i..(n-1)])
          i = n
          break
        end
        j = rawdata.index(Interesting, i)
        j = n unless j
        if i < j
          handle_data(rawdata[i..(j-1)])
        end
        i = j
        break if (i == n)
        if rawdata[i] == ?< #
          if rawdata.index(Starttagopen, i) == i
            if @literal
              handle_data(rawdata[i, 1])
              i += 1
              next
            end
            k = parse_starttag(i)
            break unless k
            i = k
            next
          end
          if rawdata.index(Endtagopen, i) == i
            k = parse_endtag(i)
            break unless k
            i = k
            @literal = false
            next
          end
          if rawdata.index(Commentopen, i) == i
            if @literal
              handle_data(rawdata[i,1])
              i += 1
              next
            end
            # parse_comment returns a length, not an absolute position
            k = parse_comment(i)
            break unless k
            i += k
            next
          end
          if rawdata.index(Special, i) == i
            if @literal
              handle_data(rawdata[i, 1])
              i += 1
              next
            end
            # parse_special returns a length, not an absolute position
            k = parse_special(i)
            break unless k
            i += k
            next
          end
        elsif rawdata[i] == ?& #
          if rawdata.index(Charref, i) == i
            i += $&.length
            handle_charref($1)
            i -= 1 unless rawdata[i-1] == ?;
            next
          end
          if rawdata.index(Entityref, i) == i
            i += $&.length
            handle_entityref($1)
            i -= 1 unless rawdata[i-1] == ?;
            next
          end
        else
          raise RuntimeError, 'neither < nor & ??'
        end
        # We get here only if incomplete matches but
        # nothing else
        match = rawdata.index(Incomplete, i)
        unless match == i
          handle_data(rawdata[i, 1])
          i += 1
          next
        end
        j = match + $&.length
        break if j == n # Really incomplete
        handle_data(rawdata[i..(j-1)])
        i = j
      end
      # end while
      if _end and i < n
        handle_data(@rawdata[i..(n-1)])
        i = n
      end
      @rawdata = rawdata[i..-1]
    end

    # Parse the comment starting at offset +i+ and pass its text to
    # #handle_comment. Returns the number of characters consumed, or nil when
    # the closing delimiter has not arrived yet.
    def parse_comment(i)
      rawdata = @rawdata
      if rawdata[i, 4] != '<!--'
        raise RuntimeError, 'unexpected call to handle_comment'
      end
      match = rawdata.index(Commentclose, i)
      return nil unless match
      matched_length = $&.length
      j = match
      handle_comment(rawdata[i+4..(j-1)])
      j = match + matched_length
      return j-i
    end

    # Parse the start tag at offset +i+, collect its attributes as
    # [name, value] pairs (names downcased) and dispatch it through
    # #finish_starttag. Returns the offset just past the tag, or nil when the
    # closing bracket has not arrived yet.
    def parse_starttag(i)
      rawdata = @rawdata
      j = rawdata.index(Endbracket, i + 1)
      return nil unless j
      attrs = []
      if rawdata[i+1] == ?> #
        # SGML shorthand: <> == <last open tag seen>
        k = j
        tag = @lasttag
      else
        match = rawdata.index(Tagfind, i + 1)
        unless match
          raise RuntimeError, 'unexpected call to parse_starttag'
        end
        k = i + 1 + ($&.length)
        tag = $&.downcase
        @lasttag = tag
      end
      while k < j
        # The attribute must begin exactly at k. String#index keeps searching
        # forward, so a plain truthiness check would pick up text located
        # after this tag (e.g. after the "/" of "<br />") as attributes.
        break unless rawdata.index(Attrfind, k) == k
        matched_length = $&.length
        attrname, rest, attrvalue = $1, $2, $3
        if not rest
          attrvalue = '' # was: = attrname
        elsif (attrvalue[0] == ?' && attrvalue[-1] == ?') or
            (attrvalue[0] == ?" && attrvalue[-1,1] == ?")
          attrvalue = attrvalue[1..-2]
        end
        attrs << [attrname.downcase, attrvalue]
        k += matched_length
      end
      if rawdata[j] == ?> #
        j += 1
      end
      finish_starttag(tag, attrs)
      return j
    end

    # Parse the end tag at offset +i+ and dispatch it through #finish_endtag.
    # Returns the offset just past the tag, or nil when the closing bracket
    # has not arrived yet.
    def parse_endtag(i)
      rawdata = @rawdata
      j = rawdata.index(Endbracket, i + 1)
      return nil unless j
      tag = (rawdata[i+2..j-1].strip).downcase
      if rawdata[j] == ?> #
        j += 1
      end
      finish_endtag(tag)
      return j
    end

    # Dispatch a start tag. Returns 1 when a start_<tag> handler exists (the
    # tag is pushed on the open stack), 0 when only do_<tag> exists, and -1
    # when the tag is unknown.
    def finish_starttag(tag, attrs)
      method = 'start_' + tag
      if self.respond_to?(method)
        @stack << tag
        handle_starttag(tag, method, attrs)
        return 1
      else
        method = 'do_' + tag
        if self.respond_to?(method)
          handle_starttag(tag, method, attrs)
          return 0
        else
          unknown_starttag(tag, attrs)
          return -1
        end
      end
    end

    # Close +tag+ together with every element opened after it. An empty
    # +tag+ (the "</>" shorthand) closes the most recently opened element.
    def finish_endtag(tag)
      if tag == ''
        found = @stack.length - 1
        if found < 0
          unknown_endtag(tag)
          return
        end
      else
        unless @stack.include? tag
          method = 'end_' + tag
          unless self.respond_to?(method)
            unknown_endtag(tag)
          end
          return
        end
        # Innermost open element with this name: with nested same-named
        # elements, one end tag must close only the most recent one.
        found = @stack.rindex(tag)
      end
      while @stack.length > found
        tag = @stack[-1]
        method = 'end_' + tag
        if respond_to?(method)
          handle_endtag(tag, method)
        else
          unknown_endtag(tag)
        end
        @stack.pop
      end
    end

    # Parse the "<!...>" declaration at offset +i+ and pass its contents
    # (without the angle brackets) to #handle_special. Returns the number of
    # characters consumed, or nil when the closing bracket is missing.
    def parse_special(i)
      rawdata = @rawdata
      match = rawdata.index(Endbracket, i+1)
      return nil unless match
      matched_length = $&.length
      handle_special(rawdata[i+1..(match-1)])
      return match - i + matched_length
    end

    # Invoke the start_/do_ handler named +method+ with the attribute list.
    def handle_starttag(tag, method, attrs)
      self.send(method, attrs)
    end

    # Invoke the end_ handler named +method+.
    def handle_endtag(tag, method)
      self.send(method)
    end

    # Print a diagnostic about an unbalanced end tag when verbose. Not called
    # from within this class; available to subclasses.
    def report_unbalanced(tag)
      if @verbose
        print '*** Unbalanced </' + tag + '>', "\n"
        print '*** Stack:', @stack.inspect, "\n"
      end
    end

    # Handle a decimal character reference (the digits of "&#NNN;").
    # Values 0..255 are emitted as a one-character string via Integer#chr;
    # anything else goes to #unknown_charref.
    # NOTE(review): for 128..255 Integer#chr yields a binary-encoded string --
    # confirm that subclasses consuming the data cope with that.
    def handle_charref(name)
      n = name.to_i
      if !(0 <= n && n <= 255)
        unknown_charref(name)
        return
      end
      handle_data(n.chr)
    end

    # Handle a named entity reference: names listed in Entitydefs are emitted
    # as data, everything else goes to #unknown_entityref.
    def handle_entityref(name)
      table = Entitydefs
      if table.include?(name)
        handle_data(table[name])
      else
        unknown_entityref(name)
        return
      end
    end

    # Hook: character data. Override in subclasses.
    def handle_data(data)
    end

    # Hook: text of a comment, without the delimiters.
    def handle_comment(data)
    end

    # Hook: contents of a "<!...>" declaration.
    def handle_special(data)
    end

    # Hook: start tag without a start_/do_ handler.
    def unknown_starttag(tag, attrs)
    end

    # Hook: end tag without an end_ handler.
    def unknown_endtag(tag)
    end

    # Hook: character reference outside 0..255.
    def unknown_charref(ref)
    end

    # Hook: entity reference not listed in Entitydefs.
    def unknown_entityref(ref)
    end
  end
end
@@ -0,0 +1,108 @@
1
+ require 'feedparser'
2
+ require 'feedparser/html2text-parser'
3
+ require 'feedparser/filesizes'
4
+
5
class String
  # Render this HTML string as plain text.
  #
  # The markup is run through FeedParser::HTML2TextParser, then whitespace is
  # normalised. When +wrapto+ is truthy the result is additionally wrapped to
  # that many columns with #wrap_text.
  def html2text(wrapto = false)
    parser = FeedParser::HTML2TextParser::new(true)
    parser.feed(clone)
    parser.close
    plain = parser.savedata

    # Substitutions are applied in place, in this exact order.
    cleanup_steps = [
      [/\A\s*/m, ''],       # leading whitespace
      [/\s*\Z/m, ''],       # trailing whitespace
      [/ *\n/m, "\n"],      # spaces before a newline
      [/\n */m, "\n"],      # spaces after a newline
      [/\n\n+/m, "\n\n"],   # runs of blank lines
      [/[ \t]+/, ' ']       # runs of spaces and tabs
    ]
    cleanup_steps.each do |pattern, replacement|
      plain.gsub!(pattern, replacement)
    end

    wrapto ? wrap_text(plain, wrapto) : plain
  end

  # Break +text+ into lines of at most +wrapto+ characters, splitting at
  # spaces or at existing line ends; every emitted line ends with a newline.
  def wrap_text(text, wrapto = 72)
    line_pattern = /(.{1,#{wrapto}})( +|$)\n?/
    text.gsub(line_pattern) { "#{$1}#{$2}\n" }
  end
end
34
+
35
module FeedParser
  class Feed
    # Plain-text rendering of the feed: a header block (type, encoding,
    # title, link, description, creator) followed by every item, each
    # preceded by a line of 40 asterisks.
    #
    # localtime, wrapto:: passed through to FeedItem#to_text.
    def to_text(localtime = true, wrapto = false)
      out = ''.dup
      out << "Type: #{@type}\n"
      out << "Encoding: #{@encoding}\n"
      out << "Title: #{@title}\n"
      out << "Link: #{@link}\n"
      out << (@description ? "Description: #{@description.html2text}\n" : "Description:\n")
      out << "Creator: #{@creator}\n"
      out << "\n"
      @items.each do |item|
        out << ('*' * 40 + "\n")
        out << item.to_text(localtime, wrapto)
      end
      out
    end
  end

  class FeedItem
    # Plain-text rendering of the item.
    #
    # localtime:: print the date as stored when true, converted to UTC
    #             otherwise.
    # wrapto::    column width handed to String#html2text for the content
    #             (false disables wrapping).
    # header::    when true the Item/link/date lines come first; when false
    #             only the link precedes the content and the Item/link/date
    #             lines are printed after a "-- " separator, below the feed
    #             information.
    def to_text(localtime = true, wrapto = false, header = true)
      out = ''.dup
      if header
        out << item_heading_text(localtime)
        out << "\n"
      elsif @link
        out << "<#{@link}>\n\n"
      end

      out << "#{@content.html2text(wrapto).chomp}\n" if @content

      if @enclosures && @enclosures.length > 0
        out << "\nFiles:"
        @enclosures.each do |enclosure|
          out << "\n #{enclosure[0]} (#{enclosure[1].to_i.to_human_readable}, #{enclosure[2]})"
        end
      end

      out << "-- " unless header
      out << "\nFeed: "
      out << @feed.title if @feed.title
      out << "\n<#{@feed.link}>" if @feed.link

      unless header
        out << "\n"
        out << item_heading_text(localtime)
      end

      out << "\nAuthor: #{creator}" if creator
      out << "\nSubject: #{@subject}" if @subject
      out << "\nFiled under: #{@categories.join(', ')}" unless @categories.empty?
      out << "\n" # final newline, for compat with history
      out
    end

    private

    # "Item: <title>" followed by the optional link and date lines, without
    # a leading or trailing newline.
    def item_heading_text(localtime)
      heading = 'Item: '.dup
      heading << @title if @title
      heading << "\n<#{@link}>" if @link
      if @date
        stamp = localtime ? @date : @date.getutc
        heading << "\nDate: #{stamp.to_s}"
      end
      heading
    end
  end
end