rubyful_soup 1.0.3 → 1.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +10 -0
- data/lib/rubyful_soup.rb +52 -26
- data/tests/rubyful_soup_tests.rb +10 -0
- metadata +26 -25
data/CHANGELOG
CHANGED
@@ -1,5 +1,15 @@
|
|
1
1
|
Rubyful Soup Changelog
|
2
2
|
|
3
|
+
1.0.4
|
4
|
+
|
5
|
+
Major performance improvements: the code is now over three times
|
6
|
+
faster. It's still slower than Beautiful Soup, but it's no
|
7
|
+
longer unusably slow on large documents.
|
8
|
+
|
9
|
+
You can now tell the parser to only parse certain tags (and their
|
10
|
+
recursive contents). This can mean a much smaller memory footprint if
|
11
|
+
all you care about are the A tags or whatever.
|
12
|
+
|
3
13
|
1.0.3
|
4
14
|
|
5
15
|
Minor bugfixes to handle more types of data. Still more changes to use
|
data/lib/rubyful_soup.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#Rubyful Soup
|
2
2
|
#Elixir and Tonic
|
3
3
|
#"The Screen-Scraper's Friend"
|
4
|
-
#v1.0.3
|
4
|
+
#v1.0.4
|
5
5
|
#http://www.crummy.com/software/RubyfulSoup/
|
6
6
|
#
|
7
7
|
#Rubyful Soup is a port to the Ruby language and idiom of the Python
|
@@ -12,6 +12,7 @@
|
|
12
12
|
#Maebashi. The easiest way to get it is to install the "htmltools"
|
13
13
|
#gem.
|
14
14
|
require 'html/sgml-parser'
|
15
|
+
require 'set'
|
15
16
|
|
16
17
|
#UTF-8 voodoo--does this really work?
|
17
18
|
$KCODE = 'u'
|
@@ -213,16 +214,33 @@ module TagModule
|
|
213
214
|
# return methods('find_all').to_proc
|
214
215
|
#end
|
215
216
|
|
216
|
-
def initialize(parser, name,
|
217
|
+
def initialize(parser, name, attr_list=[], parent=nil, previous=nil)
|
217
218
|
@hidden = false
|
218
219
|
@parser = parser
|
219
220
|
@name = name
|
220
|
-
|
221
|
-
@attrs =
|
221
|
+
@attr_list = attr_list
|
222
|
+
@attrs = nil
|
222
223
|
@contents = []
|
223
224
|
setup(parent, previous)
|
224
225
|
end
|
225
226
|
|
227
|
+
# Turn the list of attributes into a hash on demand, so we don't have
|
228
|
+
# to do it for every tag while parsing.
|
229
|
+
|
230
|
+
def attrs
|
231
|
+
unless @attrs
|
232
|
+
@attrs = @attr_list.inject({}) do |m,v|
|
233
|
+
if v[1][0] == ?" and v[1][-1] == ?":
|
234
|
+
v[1] = v[1][1..-2]
|
235
|
+
end
|
236
|
+
m[v[0]] = v[1]
|
237
|
+
m
|
238
|
+
end
|
239
|
+
@attr_list = nil
|
240
|
+
end
|
241
|
+
return @attrs
|
242
|
+
end
|
243
|
+
|
226
244
|
#soup.title_tag, or soup.title, is the same as soup.find('title')
|
227
245
|
def method_missing(name, *args)
|
228
246
|
#puts "Missing method #{name} for #{self.class.name}"
|
@@ -233,25 +251,22 @@ module TagModule
|
|
233
251
|
return find(name, *args)
|
234
252
|
end
|
235
253
|
|
236
|
-
#TODO: is there a mixin for Hash?
|
237
254
|
def [](k)
|
238
|
-
|
255
|
+
attrs[k]
|
239
256
|
end
|
240
257
|
|
241
258
|
def []=(k, v)
|
242
|
-
|
259
|
+
attrs[k] = v
|
243
260
|
end
|
244
261
|
|
245
262
|
def delete(k)
|
246
|
-
|
263
|
+
attrs.delete(k)
|
247
264
|
end
|
248
265
|
|
249
266
|
def has_key?(k)
|
250
|
-
|
267
|
+
attrs.has_key(k)
|
251
268
|
end
|
252
269
|
|
253
|
-
#End things that would go away if there was a mixin for Hash.
|
254
|
-
|
255
270
|
def each
|
256
271
|
@contents.each { |x| yield x }
|
257
272
|
end
|
@@ -287,8 +302,8 @@ module TagModule
|
|
287
302
|
#consumes whitespace, this method is not certain to reproduce the
|
288
303
|
#whitespace present in the original string.
|
289
304
|
def to_s(show_structure_indent=nil)
|
290
|
-
|
291
|
-
|
305
|
+
attr_strings = []
|
306
|
+
attrs.each { |k,v| attr_strings << %{#{k}="#{v}"} if v }
|
292
307
|
if self_closing?
|
293
308
|
close = ' /'
|
294
309
|
closeTag = nil
|
@@ -307,8 +322,8 @@ module TagModule
|
|
307
322
|
else
|
308
323
|
s = []
|
309
324
|
attribute_string = ''
|
310
|
-
unless
|
311
|
-
attribute_string = ' ' +
|
325
|
+
unless attr_strings.empty?
|
326
|
+
attribute_string = ' ' + attr_strings.join(' ')
|
312
327
|
end
|
313
328
|
s.push(space) if show_structure_indent
|
314
329
|
s.push("<#{@name}#{attribute_string}#{close}>")
|
@@ -489,11 +504,22 @@ class BeautifulStoneSoup < HTML::SGMLParser
|
|
489
504
|
@@self_closing_tags.has_key?(tag)
|
490
505
|
end
|
491
506
|
|
492
|
-
#Args: :initial_text_is_everything, :avoid_parser_problems
|
507
|
+
#Args: :initial_text_is_everything, :avoid_parser_problems, :parse_only_these
|
493
508
|
def initialize(text, args={})
|
494
509
|
super(self, @@rootTagName)
|
495
510
|
@quote_stack = []
|
496
511
|
@hidden = 1
|
512
|
+
if args[:parse_only_these]
|
513
|
+
@parse_only_these = Set.new
|
514
|
+
p = args[:parse_only_these]
|
515
|
+
if p.respond_to? :each
|
516
|
+
p.each { |x| @parse_only_these << x }
|
517
|
+
else
|
518
|
+
@parse_only_these << p
|
519
|
+
end
|
520
|
+
else
|
521
|
+
@parse_only_these = nil
|
522
|
+
end
|
497
523
|
reset
|
498
524
|
|
499
525
|
@avoid_parser_problems = args[:avoid_parser_problems] || true
|
@@ -522,7 +548,7 @@ class BeautifulStoneSoup < HTML::SGMLParser
|
|
522
548
|
end
|
523
549
|
|
524
550
|
def ==(anObject)
|
525
|
-
return anObject.to_s == to_s
|
551
|
+
return anObject != nil && anObject.to_s == to_s
|
526
552
|
end
|
527
553
|
|
528
554
|
def done
|
@@ -564,13 +590,7 @@ class BeautifulStoneSoup < HTML::SGMLParser
|
|
564
590
|
|
565
591
|
def unknown_starttag(name, attrs)
|
566
592
|
#puts "Starting tag #{name} #{attrs.inspect}"
|
567
|
-
|
568
|
-
if v[1][0] == ?" and v[1][-1] == ?":
|
569
|
-
v[1] = v[1][1..-2]
|
570
|
-
end
|
571
|
-
m[v[0]] = v[1]
|
572
|
-
m
|
573
|
-
end
|
593
|
+
|
574
594
|
unless @quote_stack.empty?
|
575
595
|
#This is not a real tag.
|
576
596
|
#puts "<#{name}> is not real!"
|
@@ -582,6 +602,8 @@ class BeautifulStoneSoup < HTML::SGMLParser
|
|
582
602
|
end
|
583
603
|
|
584
604
|
end_text
|
605
|
+
|
606
|
+
return unless !@parse_only_these or @tag_stack.size > 1 or @parse_only_these.member?(name)
|
585
607
|
self_closing = @@self_closing_tags.has_key?(name)
|
586
608
|
smart_pop(name) unless self_closing
|
587
609
|
tag = Tag.new(self, name, attrs, @currentTag, @previous_parsed)
|
@@ -608,12 +630,16 @@ class BeautifulStoneSoup < HTML::SGMLParser
|
|
608
630
|
handle_data('</#{name}>')
|
609
631
|
return
|
610
632
|
end
|
633
|
+
|
634
|
+
return unless !@parse_only_these or @tag_stack.size > 1 or @parse_only_these.member?(name)
|
635
|
+
|
611
636
|
end_text
|
612
637
|
pop_to_tag(name)
|
613
638
|
@quote_stack.pop if not @quote_stack.empty? and @quote_stack[-1] == name
|
614
639
|
end
|
615
640
|
|
616
641
|
def handle_data(data)
|
642
|
+
return unless !@parse_only_these or @tag_stack.size > 1
|
617
643
|
@currentText.push(data)
|
618
644
|
end
|
619
645
|
|
@@ -712,7 +738,7 @@ class BeautifulStoneSoup < HTML::SGMLParser
|
|
712
738
|
#puts "Pop to tag #{ name }. Inclusive? #{inclusive_pop}"
|
713
739
|
num_pops = 0
|
714
740
|
mostRecentTag = nil
|
715
|
-
(
|
741
|
+
(@tag_stack.length-1).downto(0) do |i|
|
716
742
|
if name == @tag_stack[i].name
|
717
743
|
#puts "Found at #{i}, #{@tag_stack.length-i}"
|
718
744
|
num_pops = @tag_stack.length-i
|
@@ -748,7 +774,7 @@ class BeautifulStoneSoup < HTML::SGMLParser
|
|
748
774
|
is_reset_nesting = @@reset_nesting_tags.has_key?(name)
|
749
775
|
popTo = nil
|
750
776
|
inclusive = true
|
751
|
-
|
777
|
+
@tag_stack.reverse_each do |p|
|
752
778
|
if (p == nil or p.name == name) and not is_nestable
|
753
779
|
#Non-nestable tags get popped to the top or to their
|
754
780
|
#last occurrence.
|
data/tests/rubyful_soup_tests.rb
CHANGED
@@ -391,6 +391,16 @@ class SOAPMeUp < SoupTest
|
|
391
391
|
end
|
392
392
|
end
|
393
393
|
|
394
|
+
# Verifies that you can decide not to parse certain tags.
|
395
|
+
class OnlyTheLonely < SoupTest
|
396
|
+
def test_parse_only_these
|
397
|
+
html = "<a>1<b>2</b>3</a><b>4<a>5</a>6</b>"
|
398
|
+
soup = BeautifulStoneSoup.new(html, :parse_only_these=>'b')
|
399
|
+
puts soup
|
400
|
+
assert_equal(soup.to_s, "<b>2</b><b>4<a>5</a>6</b>")
|
401
|
+
end
|
402
|
+
end
|
403
|
+
|
394
404
|
#The Unicode test suite has not yet been ported because I haven't
|
395
405
|
#figured out how Ruby does Unicode.
|
396
406
|
|
metadata
CHANGED
@@ -3,50 +3,51 @@ rubygems_version: 0.8.4
|
|
3
3
|
specification_version: 1
|
4
4
|
name: rubyful_soup
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 1.0.3
|
7
|
-
date:
|
6
|
+
version: 1.0.4
|
7
|
+
date: 2006-03-01
|
8
8
|
summary: An HTML/XML parser that handles bad markup and provides tree traversal methods.
|
9
9
|
require_paths:
|
10
|
-
|
10
|
+
- lib
|
11
11
|
email: leonardr@segfault.org
|
12
12
|
homepage: http://www.crummy.com/software/RubyfulSoup/
|
13
13
|
rubyforge_project:
|
14
|
-
description:
|
15
|
-
bad markup, and it's easy to locate the part of a document you want."
|
14
|
+
description: Rubyful Soup is a *ML parser that makes screen-scraping easy. It won't choke on bad markup, and it's easy to locate the part of a document you want.
|
16
15
|
autorequire:
|
17
16
|
default_executable:
|
18
17
|
bindir: bin
|
19
18
|
has_rdoc: true
|
20
19
|
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
21
20
|
requirements:
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
version: 0.0.0
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
26
24
|
version:
|
27
25
|
platform: ruby
|
28
26
|
authors:
|
29
|
-
|
27
|
+
- Leonard Richardson
|
30
28
|
files:
|
31
|
-
|
32
|
-
|
33
|
-
|
29
|
+
- lib/rubyful_soup.rb
|
30
|
+
- tests/rubyful_soup_tests.rb
|
31
|
+
- CHANGELOG
|
34
32
|
test_files:
|
35
|
-
|
33
|
+
- tests/rubyful_soup_tests.rb
|
36
34
|
rdoc_options: []
|
35
|
+
|
37
36
|
extra_rdoc_files:
|
38
|
-
|
37
|
+
- CHANGELOG
|
39
38
|
executables: []
|
39
|
+
|
40
40
|
extensions: []
|
41
|
+
|
41
42
|
requirements: []
|
43
|
+
|
42
44
|
dependencies:
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: htmltools
|
47
|
+
version_requirement:
|
48
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">"
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 0.0.0
|
53
|
+
version:
|