rubyful_soup 1.0.3 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,5 +1,15 @@
1
1
  Rubyful Soup Changelog
2
2
 
3
+ 1.0.4
4
+
5
+ Major performance improvements: the code is now over three times
6
+ faster. It's still slower than Beautiful Soup, but it's no
7
+ longer unusably slow on large documents.
8
+
9
+ You can now tell the parser to only parse certain tags (and their
10
+ recursive contents). This can mean a much smaller memory footprint if
11
+ all you care about are the A tags or whatever.
12
+
3
13
  1.0.3
4
14
 
5
15
  Minor bugfixes to handle more types of data. Still more changes to use
@@ -1,7 +1,7 @@
1
1
  #Rubyful Soup
2
2
  #Elixir and Tonic
3
3
  #"The Screen-Scraper's Friend"
4
- #v1.0.3
4
+ #v1.0.4
5
5
  #http://www.crummy.com/software/RubyfulSoup/
6
6
  #
7
7
  #Rubyful Soup is a port to the Ruby language and idiom of the Python
@@ -12,6 +12,7 @@
12
12
  #Maebashi. The easiest way to get it is to install the "htmltools"
13
13
  #gem.
14
14
  require 'html/sgml-parser'
15
+ require 'set'
15
16
 
16
17
  #UTF-8 voodoo--does this really work?
17
18
  $KCODE = 'u'
@@ -213,16 +214,33 @@ module TagModule
213
214
  # return methods('find_all').to_proc
214
215
  #end
215
216
 
216
- def initialize(parser, name, attrs=nil, parent=nil, previous=nil)
217
+ def initialize(parser, name, attr_list=[], parent=nil, previous=nil)
217
218
  @hidden = false
218
219
  @parser = parser
219
220
  @name = name
220
- attrs ||= {}
221
- @attrs = attrs
221
+ @attr_list = attr_list
222
+ @attrs = nil
222
223
  @contents = []
223
224
  setup(parent, previous)
224
225
  end
225
226
 
227
+ # Turn the list of attributes into a hash on demand, so we don't have
228
+ # to do it for every tag while parsing.
229
+
230
+ def attrs
231
+ unless @attrs
232
+ @attrs = @attr_list.inject({}) do |m,v|
233
+ if v[1][0] == ?" and v[1][-1] == ?":
234
+ v[1] = v[1][1..-2]
235
+ end
236
+ m[v[0]] = v[1]
237
+ m
238
+ end
239
+ @attr_list = nil
240
+ end
241
+ return @attrs
242
+ end
243
+
226
244
  #soup.title_tag, or soup.title, is the same as soup.find('title')
227
245
  def method_missing(name, *args)
228
246
  #puts "Missing method #{name} for #{self.class.name}"
@@ -233,25 +251,22 @@ module TagModule
233
251
  return find(name, *args)
234
252
  end
235
253
 
236
- #TODO: is there a mixin for Hash?
237
254
  def [](k)
238
- return @attrs[k]
255
+ attrs[k]
239
256
  end
240
257
 
241
258
  def []=(k, v)
242
- @attrs[k] = v
259
+ attrs[k] = v
243
260
  end
244
261
 
245
262
  def delete(k)
246
- @attrs.delete(k)
263
+ attrs.delete(k)
247
264
  end
248
265
 
249
266
  def has_key?(k)
250
- return @attrs.has_key(k)
267
+ attrs.has_key(k)
251
268
  end
252
269
 
253
- #End things that would go away if there was a mixin for Hash.
254
-
255
270
  def each
256
271
  @contents.each { |x| yield x }
257
272
  end
@@ -287,8 +302,8 @@ module TagModule
287
302
  #consumes whitespace, this method is not certain to reproduce the
288
303
  #whitespace present in the original string.
289
304
  def to_s(show_structure_indent=nil)
290
- attrs = []
291
- @attrs.each { |k,v| attrs.push("#{k}=\"#{v}\"") if v }
305
+ attr_strings = []
306
+ attrs.each { |k,v| attr_strings << %{#{k}="#{v}"} if v }
292
307
  if self_closing?
293
308
  close = ' /'
294
309
  closeTag = nil
@@ -307,8 +322,8 @@ module TagModule
307
322
  else
308
323
  s = []
309
324
  attribute_string = ''
310
- unless attrs.empty?
311
- attribute_string = ' ' + attrs.join(' ')
325
+ unless attr_strings.empty?
326
+ attribute_string = ' ' + attr_strings.join(' ')
312
327
  end
313
328
  s.push(space) if show_structure_indent
314
329
  s.push("<#{@name}#{attribute_string}#{close}>")
@@ -489,11 +504,22 @@ class BeautifulStoneSoup < HTML::SGMLParser
489
504
  @@self_closing_tags.has_key?(tag)
490
505
  end
491
506
 
492
- #Args: :initial_text_is_everything, :avoid_parser_problems
507
+ #Args: :initial_text_is_everything, :avoid_parser_problems, :parse_only_these
493
508
  def initialize(text, args={})
494
509
  super(self, @@rootTagName)
495
510
  @quote_stack = []
496
511
  @hidden = 1
512
+ if args[:parse_only_these]
513
+ @parse_only_these = Set.new
514
+ p = args[:parse_only_these]
515
+ if p.respond_to? :each
516
+ p.each { |x| @parse_only_these << x }
517
+ else
518
+ @parse_only_these << p
519
+ end
520
+ else
521
+ @parse_only_these = nil
522
+ end
497
523
  reset
498
524
 
499
525
  @avoid_parser_problems = args[:avoid_parser_problems] || true
@@ -522,7 +548,7 @@ class BeautifulStoneSoup < HTML::SGMLParser
522
548
  end
523
549
 
524
550
  def ==(anObject)
525
- return anObject.to_s == to_s
551
+ return anObject != nil && anObject.to_s == to_s
526
552
  end
527
553
 
528
554
  def done
@@ -564,13 +590,7 @@ class BeautifulStoneSoup < HTML::SGMLParser
564
590
 
565
591
  def unknown_starttag(name, attrs)
566
592
  #puts "Starting tag #{name} #{attrs.inspect}"
567
- attrs = attrs.inject({}) do |m,v|
568
- if v[1][0] == ?" and v[1][-1] == ?":
569
- v[1] = v[1][1..-2]
570
- end
571
- m[v[0]] = v[1]
572
- m
573
- end
593
+
574
594
  unless @quote_stack.empty?
575
595
  #This is not a real tag.
576
596
  #puts "<#{name}> is not real!"
@@ -582,6 +602,8 @@ class BeautifulStoneSoup < HTML::SGMLParser
582
602
  end
583
603
 
584
604
  end_text
605
+
606
+ return unless !@parse_only_these or @tag_stack.size > 1 or @parse_only_these.member?(name)
585
607
  self_closing = @@self_closing_tags.has_key?(name)
586
608
  smart_pop(name) unless self_closing
587
609
  tag = Tag.new(self, name, attrs, @currentTag, @previous_parsed)
@@ -608,12 +630,16 @@ class BeautifulStoneSoup < HTML::SGMLParser
608
630
  handle_data('</#{name}>')
609
631
  return
610
632
  end
633
+
634
+ return unless !@parse_only_these or @tag_stack.size > 1 or @parse_only_these.member?(name)
635
+
611
636
  end_text
612
637
  pop_to_tag(name)
613
638
  @quote_stack.pop if not @quote_stack.empty? and @quote_stack[-1] == name
614
639
  end
615
640
 
616
641
  def handle_data(data)
642
+ return unless !@parse_only_these or @tag_stack.size > 1
617
643
  @currentText.push(data)
618
644
  end
619
645
 
@@ -712,7 +738,7 @@ class BeautifulStoneSoup < HTML::SGMLParser
712
738
  #puts "Pop to tag #{ name }. Inclusive? #{inclusive_pop}"
713
739
  num_pops = 0
714
740
  mostRecentTag = nil
715
- (0...@tag_stack.length).to_a.reverse.each do |i|
741
+ (@tag_stack.length-1).downto(0) do |i|
716
742
  if name == @tag_stack[i].name
717
743
  #puts "Found at #{i}, #{@tag_stack.length-i}"
718
744
  num_pops = @tag_stack.length-i
@@ -748,7 +774,7 @@ class BeautifulStoneSoup < HTML::SGMLParser
748
774
  is_reset_nesting = @@reset_nesting_tags.has_key?(name)
749
775
  popTo = nil
750
776
  inclusive = true
751
- for p in @tag_stack.reverse
777
+ @tag_stack.reverse_each do |p|
752
778
  if (p == nil or p.name == name) and not is_nestable
753
779
  #Non-nestable tags get popped to the top or to their
754
780
  #last occurrence.
@@ -391,6 +391,16 @@ class SOAPMeUp < SoupTest
391
391
  end
392
392
  end
393
393
 
394
+ # Verifies that you can decide not to parse certain tags.
395
+ class OnlyTheLonely < SoupTest
396
+ def test_parse_only_these
397
+ html = "<a>1<b>2</b>3</a><b>4<a>5</a>6</b>"
398
+ soup = BeautifulStoneSoup.new(html, :parse_only_these=>'b')
399
+ puts soup
400
+ assert_equal(soup.to_s, "<b>2</b><b>4<a>5</a>6</b>")
401
+ end
402
+ end
403
+
394
404
  #The Unicode test suite has not yet been ported because I haven't
395
405
  #figured out how Ruby does Unicode.
396
406
 
metadata CHANGED
@@ -3,50 +3,51 @@ rubygems_version: 0.8.4
3
3
  specification_version: 1
4
4
  name: rubyful_soup
5
5
  version: !ruby/object:Gem::Version
6
- version: 1.0.3
7
- date: 2005-11-04
6
+ version: 1.0.4
7
+ date: 2006-03-01
8
8
  summary: An HTML/XML parser that handles bad markup and provides tree traversal methods.
9
9
  require_paths:
10
- - lib
10
+ - lib
11
11
  email: leonardr@segfault.org
12
12
  homepage: http://www.crummy.com/software/RubyfulSoup/
13
13
  rubyforge_project:
14
- description: "Rubyful Soup is a *ML parser that makes screen-scraping easy. It won't choke on
15
- bad markup, and it's easy to locate the part of a document you want."
14
+ description: Rubyful Soup is a *ML parser that makes screen-scraping easy. It won't choke on bad markup, and it's easy to locate the part of a document you want.
16
15
  autorequire:
17
16
  default_executable:
18
17
  bindir: bin
19
18
  has_rdoc: true
20
19
  required_ruby_version: !ruby/object:Gem::Version::Requirement
21
20
  requirements:
22
- -
23
- - ">"
24
- - !ruby/object:Gem::Version
25
- version: 0.0.0
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
26
24
  version:
27
25
  platform: ruby
28
26
  authors:
29
- - Leonard Richardson
27
+ - Leonard Richardson
30
28
  files:
31
- - lib/rubyful_soup.rb
32
- - tests/rubyful_soup_tests.rb
33
- - CHANGELOG
29
+ - lib/rubyful_soup.rb
30
+ - tests/rubyful_soup_tests.rb
31
+ - CHANGELOG
34
32
  test_files:
35
- - tests/rubyful_soup_tests.rb
33
+ - tests/rubyful_soup_tests.rb
36
34
  rdoc_options: []
35
+
37
36
  extra_rdoc_files:
38
- - CHANGELOG
37
+ - CHANGELOG
39
38
  executables: []
39
+
40
40
  extensions: []
41
+
41
42
  requirements: []
43
+
42
44
  dependencies:
43
- - !ruby/object:Gem::Dependency
44
- name: htmltools
45
- version_requirement:
46
- version_requirements: !ruby/object:Gem::Version::Requirement
47
- requirements:
48
- -
49
- - ">"
50
- - !ruby/object:Gem::Version
51
- version: 0.0.0
52
- version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: htmltools
47
+ version_requirement:
48
+ version_requirements: !ruby/object:Gem::Version::Requirement
49
+ requirements:
50
+ - - ">"
51
+ - !ruby/object:Gem::Version
52
+ version: 0.0.0
53
+ version: