rubyful_soup 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +10 -0
- data/lib/rubyful_soup.rb +52 -26
- data/tests/rubyful_soup_tests.rb +10 -0
- metadata +26 -25
data/CHANGELOG
CHANGED
@@ -1,5 +1,15 @@
|
|
1
1
|
Rubyful Soup Changelog
|
2
2
|
|
3
|
+
1.0.4
|
4
|
+
|
5
|
+
Major performance improvements: the code is now over three times
|
6
|
+
faster. It's still slower than Beautiful Soup, but it's no
|
7
|
+
longer unusably slow on large documents.
|
8
|
+
|
9
|
+
You can now tell the parser to only parse certain tags (and their
|
10
|
+
recursive contents). This can mean a much smaller memory footprint if
|
11
|
+
all you care about are the A tags or whatever.
|
12
|
+
|
3
13
|
1.0.3
|
4
14
|
|
5
15
|
Minor bugfixes to handle more types of data. Still more changes to use
|
data/lib/rubyful_soup.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#Rubyful Soup
|
2
2
|
#Elixir and Tonic
|
3
3
|
#"The Screen-Scraper's Friend"
|
4
|
-
#v1.0.
|
4
|
+
#v1.0.4
|
5
5
|
#http://www.crummy.com/software/RubyfulSoup/
|
6
6
|
#
|
7
7
|
#Rubyful Soup is a port to the Ruby language and idiom of the Python
|
@@ -12,6 +12,7 @@
|
|
12
12
|
#Maebashi. The easiest way to get it is to install the "htmltools"
|
13
13
|
#gem.
|
14
14
|
require 'html/sgml-parser'
|
15
|
+
require 'set'
|
15
16
|
|
16
17
|
#UTF-8 voodoo--does this really work?
|
17
18
|
$KCODE = 'u'
|
@@ -213,16 +214,33 @@ module TagModule
|
|
213
214
|
# return methods('find_all').to_proc
|
214
215
|
#end
|
215
216
|
|
216
|
-
def initialize(parser, name,
|
217
|
+
def initialize(parser, name, attr_list=[], parent=nil, previous=nil)
|
217
218
|
@hidden = false
|
218
219
|
@parser = parser
|
219
220
|
@name = name
|
220
|
-
|
221
|
-
@attrs =
|
221
|
+
@attr_list = attr_list
|
222
|
+
@attrs = nil
|
222
223
|
@contents = []
|
223
224
|
setup(parent, previous)
|
224
225
|
end
|
225
226
|
|
227
|
+
# Turn the list of attributes into a hash on demand, so we don't have
|
228
|
+
# to do it for every tag while parsing.
|
229
|
+
|
230
|
+
def attrs
|
231
|
+
unless @attrs
|
232
|
+
@attrs = @attr_list.inject({}) do |m,v|
|
233
|
+
if v[1][0] == ?" and v[1][-1] == ?":
|
234
|
+
v[1] = v[1][1..-2]
|
235
|
+
end
|
236
|
+
m[v[0]] = v[1]
|
237
|
+
m
|
238
|
+
end
|
239
|
+
@attr_list = nil
|
240
|
+
end
|
241
|
+
return @attrs
|
242
|
+
end
|
243
|
+
|
226
244
|
#soup.title_tag, or soup.title, is the same as soup.find('title')
|
227
245
|
def method_missing(name, *args)
|
228
246
|
#puts "Missing method #{name} for #{self.class.name}"
|
@@ -233,25 +251,22 @@ module TagModule
|
|
233
251
|
return find(name, *args)
|
234
252
|
end
|
235
253
|
|
236
|
-
#TODO: is there a mixin for Hash?
|
237
254
|
def [](k)
|
238
|
-
|
255
|
+
attrs[k]
|
239
256
|
end
|
240
257
|
|
241
258
|
def []=(k, v)
|
242
|
-
|
259
|
+
attrs[k] = v
|
243
260
|
end
|
244
261
|
|
245
262
|
def delete(k)
|
246
|
-
|
263
|
+
attrs.delete(k)
|
247
264
|
end
|
248
265
|
|
249
266
|
# Reports whether this tag has an attribute named +k+.
# Delegates to the hash returned by #attrs, like the sibling accessors
# #[], #[]= and #delete. The previous body called Hash#has_key (without
# the trailing "?"), which does not exist and raised NoMethodError.
def has_key?(k)
  attrs.has_key?(k)
end
|
252
269
|
|
253
|
-
#End things that would go away if there was a mixin for Hash.
|
254
|
-
|
255
270
|
def each
|
256
271
|
@contents.each { |x| yield x }
|
257
272
|
end
|
@@ -287,8 +302,8 @@ module TagModule
|
|
287
302
|
#consumes whitespace, this method is not certain to reproduce the
|
288
303
|
#whitespace present in the original string.
|
289
304
|
def to_s(show_structure_indent=nil)
|
290
|
-
|
291
|
-
|
305
|
+
attr_strings = []
|
306
|
+
attrs.each { |k,v| attr_strings << %{#{k}="#{v}"} if v }
|
292
307
|
if self_closing?
|
293
308
|
close = ' /'
|
294
309
|
closeTag = nil
|
@@ -307,8 +322,8 @@ module TagModule
|
|
307
322
|
else
|
308
323
|
s = []
|
309
324
|
attribute_string = ''
|
310
|
-
unless
|
311
|
-
attribute_string = ' ' +
|
325
|
+
unless attr_strings.empty?
|
326
|
+
attribute_string = ' ' + attr_strings.join(' ')
|
312
327
|
end
|
313
328
|
s.push(space) if show_structure_indent
|
314
329
|
s.push("<#{@name}#{attribute_string}#{close}>")
|
@@ -489,11 +504,22 @@ class BeautifulStoneSoup < HTML::SGMLParser
|
|
489
504
|
@@self_closing_tags.has_key?(tag)
|
490
505
|
end
|
491
506
|
|
492
|
-
#Args: :initial_text_is_everything, :avoid_parser_problems
|
507
|
+
#Args: :initial_text_is_everything, :avoid_parser_problems, :parse_only_these
|
493
508
|
def initialize(text, args={})
|
494
509
|
super(self, @@rootTagName)
|
495
510
|
@quote_stack = []
|
496
511
|
@hidden = 1
|
512
|
+
if args[:parse_only_these]
|
513
|
+
@parse_only_these = Set.new
|
514
|
+
p = args[:parse_only_these]
|
515
|
+
if p.respond_to? :each
|
516
|
+
p.each { |x| @parse_only_these << x }
|
517
|
+
else
|
518
|
+
@parse_only_these << p
|
519
|
+
end
|
520
|
+
else
|
521
|
+
@parse_only_these = nil
|
522
|
+
end
|
497
523
|
reset
|
498
524
|
|
499
525
|
@avoid_parser_problems = args[:avoid_parser_problems] || true
|
@@ -522,7 +548,7 @@ class BeautifulStoneSoup < HTML::SGMLParser
|
|
522
548
|
end
|
523
549
|
|
524
550
|
def ==(anObject)
|
525
|
-
return anObject.to_s == to_s
|
551
|
+
return anObject != nil && anObject.to_s == to_s
|
526
552
|
end
|
527
553
|
|
528
554
|
def done
|
@@ -564,13 +590,7 @@ class BeautifulStoneSoup < HTML::SGMLParser
|
|
564
590
|
|
565
591
|
def unknown_starttag(name, attrs)
|
566
592
|
#puts "Starting tag #{name} #{attrs.inspect}"
|
567
|
-
|
568
|
-
if v[1][0] == ?" and v[1][-1] == ?":
|
569
|
-
v[1] = v[1][1..-2]
|
570
|
-
end
|
571
|
-
m[v[0]] = v[1]
|
572
|
-
m
|
573
|
-
end
|
593
|
+
|
574
594
|
unless @quote_stack.empty?
|
575
595
|
#This is not a real tag.
|
576
596
|
#puts "<#{name}> is not real!"
|
@@ -582,6 +602,8 @@ class BeautifulStoneSoup < HTML::SGMLParser
|
|
582
602
|
end
|
583
603
|
|
584
604
|
end_text
|
605
|
+
|
606
|
+
return unless !@parse_only_these or @tag_stack.size > 1 or @parse_only_these.member?(name)
|
585
607
|
self_closing = @@self_closing_tags.has_key?(name)
|
586
608
|
smart_pop(name) unless self_closing
|
587
609
|
tag = Tag.new(self, name, attrs, @currentTag, @previous_parsed)
|
@@ -608,12 +630,16 @@ class BeautifulStoneSoup < HTML::SGMLParser
|
|
608
630
|
handle_data('</#{name}>')
|
609
631
|
return
|
610
632
|
end
|
633
|
+
|
634
|
+
return unless !@parse_only_these or @tag_stack.size > 1 or @parse_only_these.member?(name)
|
635
|
+
|
611
636
|
end_text
|
612
637
|
pop_to_tag(name)
|
613
638
|
@quote_stack.pop if not @quote_stack.empty? and @quote_stack[-1] == name
|
614
639
|
end
|
615
640
|
|
616
641
|
def handle_data(data)
|
642
|
+
return unless !@parse_only_these or @tag_stack.size > 1
|
617
643
|
@currentText.push(data)
|
618
644
|
end
|
619
645
|
|
@@ -712,7 +738,7 @@ class BeautifulStoneSoup < HTML::SGMLParser
|
|
712
738
|
#puts "Pop to tag #{ name }. Inclusive? #{inclusive_pop}"
|
713
739
|
num_pops = 0
|
714
740
|
mostRecentTag = nil
|
715
|
-
(
|
741
|
+
(@tag_stack.length-1).downto(0) do |i|
|
716
742
|
if name == @tag_stack[i].name
|
717
743
|
#puts "Found at #{i}, #{@tag_stack.length-i}"
|
718
744
|
num_pops = @tag_stack.length-i
|
@@ -748,7 +774,7 @@ class BeautifulStoneSoup < HTML::SGMLParser
|
|
748
774
|
is_reset_nesting = @@reset_nesting_tags.has_key?(name)
|
749
775
|
popTo = nil
|
750
776
|
inclusive = true
|
751
|
-
|
777
|
+
@tag_stack.reverse_each do |p|
|
752
778
|
if (p == nil or p.name == name) and not is_nestable
|
753
779
|
#Non-nestable tags get popped to the top or to their
|
754
780
|
#last occurrence.
|
data/tests/rubyful_soup_tests.rb
CHANGED
@@ -391,6 +391,16 @@ class SOAPMeUp < SoupTest
|
|
391
391
|
end
|
392
392
|
end
|
393
393
|
|
394
|
+
# Verifies that you can decide not to parse certain tags.
|
395
|
+
class OnlyTheLonely < SoupTest
  # With :parse_only_these => 'b', only <b> tags and everything nested
  # inside them should survive: the outer <a>, and the text "1" and "3"
  # that sit outside any <b>, are expected to be absent from the output.
  def test_parse_only_these
    html = "<a>1<b>2</b>3</a><b>4<a>5</a>6</b>"
    soup = BeautifulStoneSoup.new(html, :parse_only_these=>'b')
    # Expected value first, actual second, so failure messages read correctly.
    assert_equal("<b>2</b><b>4<a>5</a>6</b>", soup.to_s)
  end
end
|
403
|
+
|
394
404
|
#The Unicode test suite has not yet been ported because I haven't
|
395
405
|
#figured out how Ruby does Unicode.
|
396
406
|
|
metadata
CHANGED
@@ -3,50 +3,51 @@ rubygems_version: 0.8.4
|
|
3
3
|
specification_version: 1
|
4
4
|
name: rubyful_soup
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 1.0.
|
7
|
-
date:
|
6
|
+
version: 1.0.4
|
7
|
+
date: 2006-03-01
|
8
8
|
summary: An HTML/XML parser that handles bad markup and provides tree traversal methods.
|
9
9
|
require_paths:
|
10
|
-
|
10
|
+
- lib
|
11
11
|
email: leonardr@segfault.org
|
12
12
|
homepage: http://www.crummy.com/software/RubyfulSoup/
|
13
13
|
rubyforge_project:
|
14
|
-
description:
|
15
|
-
bad markup, and it's easy to locate the part of a document you want."
|
14
|
+
description: Rubyful Soup is a *ML parser that makes screen-scraping easy. It won't choke on bad markup, and it's easy to locate the part of a document you want.
|
16
15
|
autorequire:
|
17
16
|
default_executable:
|
18
17
|
bindir: bin
|
19
18
|
has_rdoc: true
|
20
19
|
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
21
20
|
requirements:
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
version: 0.0.0
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
26
24
|
version:
|
27
25
|
platform: ruby
|
28
26
|
authors:
|
29
|
-
|
27
|
+
- Leonard Richardson
|
30
28
|
files:
|
31
|
-
|
32
|
-
|
33
|
-
|
29
|
+
- lib/rubyful_soup.rb
|
30
|
+
- tests/rubyful_soup_tests.rb
|
31
|
+
- CHANGELOG
|
34
32
|
test_files:
|
35
|
-
|
33
|
+
- tests/rubyful_soup_tests.rb
|
36
34
|
rdoc_options: []
|
35
|
+
|
37
36
|
extra_rdoc_files:
|
38
|
-
|
37
|
+
- CHANGELOG
|
39
38
|
executables: []
|
39
|
+
|
40
40
|
extensions: []
|
41
|
+
|
41
42
|
requirements: []
|
43
|
+
|
42
44
|
dependencies:
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: htmltools
|
47
|
+
version_requirement:
|
48
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">"
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 0.0.0
|
53
|
+
version:
|