rubyful_soup 1.0.3 → 1.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,5 +1,15 @@
1
1
  Rubyful Soup Changelog
2
2
 
3
+ 1.0.4
4
+
5
+ Major performance improvements: the code is now over three times
6
+ faster. It's still slower than Beautiful Soup, but it's no
7
+ longer unusably slow on large documents.
8
+
9
+ You can now tell the parser to only parse certain tags (and their
10
+ recursive contents). This can mean a much smaller memory footprint if
11
+ all you care about are the A tags or whatever.
12
+
3
13
  1.0.3
4
14
 
5
15
  Minor bugfixes to handle more types of data. Still more changes to use
@@ -1,7 +1,7 @@
1
1
  #Rubyful Soup
2
2
  #Elixir and Tonic
3
3
  #"The Screen-Scraper's Friend"
4
- #v1.0.3
4
+ #v1.0.4
5
5
  #http://www.crummy.com/software/RubyfulSoup/
6
6
  #
7
7
  #Rubyful Soup is a port to the Ruby language and idiom of the Python
@@ -12,6 +12,7 @@
12
12
  #Maebashi. The easiest way to get it is to install the "htmltools"
13
13
  #gem.
14
14
  require 'html/sgml-parser'
15
+ require 'set'
15
16
 
16
17
  #UTF-8 voodoo--does this really work?
17
18
  $KCODE = 'u'
@@ -213,16 +214,33 @@ module TagModule
213
214
  # return methods('find_all').to_proc
214
215
  #end
215
216
 
216
- def initialize(parser, name, attrs=nil, parent=nil, previous=nil)
217
+ def initialize(parser, name, attr_list=[], parent=nil, previous=nil)
217
218
  @hidden = false
218
219
  @parser = parser
219
220
  @name = name
220
- attrs ||= {}
221
- @attrs = attrs
221
+ @attr_list = attr_list
222
+ @attrs = nil
222
223
  @contents = []
223
224
  setup(parent, previous)
224
225
  end
225
226
 
227
+ # Turn the list of attributes into a hash on demand, so we don't have
228
+ # to do it for every tag while parsing.
229
+
230
+ def attrs
231
+ unless @attrs
232
+ @attrs = @attr_list.inject({}) do |m,v|
233
+ if v[1][0] == ?" and v[1][-1] == ?":
234
+ v[1] = v[1][1..-2]
235
+ end
236
+ m[v[0]] = v[1]
237
+ m
238
+ end
239
+ @attr_list = nil
240
+ end
241
+ return @attrs
242
+ end
243
+
226
244
  #soup.title_tag, or soup.title, is the same as soup.find('title')
227
245
  def method_missing(name, *args)
228
246
  #puts "Missing method #{name} for #{self.class.name}"
@@ -233,25 +251,22 @@ module TagModule
233
251
  return find(name, *args)
234
252
  end
235
253
 
236
- #TODO: is there a mixin for Hash?
237
254
  def [](k)
238
- return @attrs[k]
255
+ attrs[k]
239
256
  end
240
257
 
241
258
  def []=(k, v)
242
- @attrs[k] = v
259
+ attrs[k] = v
243
260
  end
244
261
 
245
262
  def delete(k)
246
- @attrs.delete(k)
263
+ attrs.delete(k)
247
264
  end
248
265
 
249
266
  def has_key?(k)
250
- return @attrs.has_key(k)
267
+ attrs.has_key(k)
251
268
  end
252
269
 
253
- #End things that would go away if there was a mixin for Hash.
254
-
255
270
  def each
256
271
  @contents.each { |x| yield x }
257
272
  end
@@ -287,8 +302,8 @@ module TagModule
287
302
  #consumes whitespace, this method is not certain to reproduce the
288
303
  #whitespace present in the original string.
289
304
  def to_s(show_structure_indent=nil)
290
- attrs = []
291
- @attrs.each { |k,v| attrs.push("#{k}=\"#{v}\"") if v }
305
+ attr_strings = []
306
+ attrs.each { |k,v| attr_strings << %{#{k}="#{v}"} if v }
292
307
  if self_closing?
293
308
  close = ' /'
294
309
  closeTag = nil
@@ -307,8 +322,8 @@ module TagModule
307
322
  else
308
323
  s = []
309
324
  attribute_string = ''
310
- unless attrs.empty?
311
- attribute_string = ' ' + attrs.join(' ')
325
+ unless attr_strings.empty?
326
+ attribute_string = ' ' + attr_strings.join(' ')
312
327
  end
313
328
  s.push(space) if show_structure_indent
314
329
  s.push("<#{@name}#{attribute_string}#{close}>")
@@ -489,11 +504,22 @@ class BeautifulStoneSoup < HTML::SGMLParser
489
504
  @@self_closing_tags.has_key?(tag)
490
505
  end
491
506
 
492
- #Args: :initial_text_is_everything, :avoid_parser_problems
507
+ #Args: :initial_text_is_everything, :avoid_parser_problems, :parse_only_these
493
508
  def initialize(text, args={})
494
509
  super(self, @@rootTagName)
495
510
  @quote_stack = []
496
511
  @hidden = 1
512
+ if args[:parse_only_these]
513
+ @parse_only_these = Set.new
514
+ p = args[:parse_only_these]
515
+ if p.respond_to? :each
516
+ p.each { |x| @parse_only_these << x }
517
+ else
518
+ @parse_only_these << p
519
+ end
520
+ else
521
+ @parse_only_these = nil
522
+ end
497
523
  reset
498
524
 
499
525
  @avoid_parser_problems = args[:avoid_parser_problems] || true
@@ -522,7 +548,7 @@ class BeautifulStoneSoup < HTML::SGMLParser
522
548
  end
523
549
 
524
550
  def ==(anObject)
525
- return anObject.to_s == to_s
551
+ return anObject != nil && anObject.to_s == to_s
526
552
  end
527
553
 
528
554
  def done
@@ -564,13 +590,7 @@ class BeautifulStoneSoup < HTML::SGMLParser
564
590
 
565
591
  def unknown_starttag(name, attrs)
566
592
  #puts "Starting tag #{name} #{attrs.inspect}"
567
- attrs = attrs.inject({}) do |m,v|
568
- if v[1][0] == ?" and v[1][-1] == ?":
569
- v[1] = v[1][1..-2]
570
- end
571
- m[v[0]] = v[1]
572
- m
573
- end
593
+
574
594
  unless @quote_stack.empty?
575
595
  #This is not a real tag.
576
596
  #puts "<#{name}> is not real!"
@@ -582,6 +602,8 @@ class BeautifulStoneSoup < HTML::SGMLParser
582
602
  end
583
603
 
584
604
  end_text
605
+
606
+ return unless !@parse_only_these or @tag_stack.size > 1 or @parse_only_these.member?(name)
585
607
  self_closing = @@self_closing_tags.has_key?(name)
586
608
  smart_pop(name) unless self_closing
587
609
  tag = Tag.new(self, name, attrs, @currentTag, @previous_parsed)
@@ -608,12 +630,16 @@ class BeautifulStoneSoup < HTML::SGMLParser
608
630
  handle_data('</#{name}>')
609
631
  return
610
632
  end
633
+
634
+ return unless !@parse_only_these or @tag_stack.size > 1 or @parse_only_these.member?(name)
635
+
611
636
  end_text
612
637
  pop_to_tag(name)
613
638
  @quote_stack.pop if not @quote_stack.empty? and @quote_stack[-1] == name
614
639
  end
615
640
 
616
641
  def handle_data(data)
642
+ return unless !@parse_only_these or @tag_stack.size > 1
617
643
  @currentText.push(data)
618
644
  end
619
645
 
@@ -712,7 +738,7 @@ class BeautifulStoneSoup < HTML::SGMLParser
712
738
  #puts "Pop to tag #{ name }. Inclusive? #{inclusive_pop}"
713
739
  num_pops = 0
714
740
  mostRecentTag = nil
715
- (0...@tag_stack.length).to_a.reverse.each do |i|
741
+ (@tag_stack.length-1).downto(0) do |i|
716
742
  if name == @tag_stack[i].name
717
743
  #puts "Found at #{i}, #{@tag_stack.length-i}"
718
744
  num_pops = @tag_stack.length-i
@@ -748,7 +774,7 @@ class BeautifulStoneSoup < HTML::SGMLParser
748
774
  is_reset_nesting = @@reset_nesting_tags.has_key?(name)
749
775
  popTo = nil
750
776
  inclusive = true
751
- for p in @tag_stack.reverse
777
+ @tag_stack.reverse_each do |p|
752
778
  if (p == nil or p.name == name) and not is_nestable
753
779
  #Non-nestable tags get popped to the top or to their
754
780
  #last occurrence.
@@ -391,6 +391,16 @@ class SOAPMeUp < SoupTest
391
391
  end
392
392
  end
393
393
 
394
+ # Verifies that you can decide not to parse certain tags.
395
+ class OnlyTheLonely < SoupTest
396
+ def test_parse_only_these
397
+ html = "<a>1<b>2</b>3</a><b>4<a>5</a>6</b>"
398
+ soup = BeautifulStoneSoup.new(html, :parse_only_these=>'b')
399
+ puts soup
400
+ assert_equal(soup.to_s, "<b>2</b><b>4<a>5</a>6</b>")
401
+ end
402
+ end
403
+
394
404
  #The Unicode test suite has not yet been ported because I haven't
395
405
  #figured out how Ruby does Unicode.
396
406
 
metadata CHANGED
@@ -3,50 +3,51 @@ rubygems_version: 0.8.4
3
3
  specification_version: 1
4
4
  name: rubyful_soup
5
5
  version: !ruby/object:Gem::Version
6
- version: 1.0.3
7
- date: 2005-11-04
6
+ version: 1.0.4
7
+ date: 2006-03-01
8
8
  summary: An HTML/XML parser that handles bad markup and provides tree traversal methods.
9
9
  require_paths:
10
- - lib
10
+ - lib
11
11
  email: leonardr@segfault.org
12
12
  homepage: http://www.crummy.com/software/RubyfulSoup/
13
13
  rubyforge_project:
14
- description: "Rubyful Soup is a *ML parser that makes screen-scraping easy. It won't choke on
15
- bad markup, and it's easy to locate the part of a document you want."
14
+ description: Rubyful Soup is a *ML parser that makes screen-scraping easy. It won't choke on bad markup, and it's easy to locate the part of a document you want.
16
15
  autorequire:
17
16
  default_executable:
18
17
  bindir: bin
19
18
  has_rdoc: true
20
19
  required_ruby_version: !ruby/object:Gem::Version::Requirement
21
20
  requirements:
22
- -
23
- - ">"
24
- - !ruby/object:Gem::Version
25
- version: 0.0.0
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
26
24
  version:
27
25
  platform: ruby
28
26
  authors:
29
- - Leonard Richardson
27
+ - Leonard Richardson
30
28
  files:
31
- - lib/rubyful_soup.rb
32
- - tests/rubyful_soup_tests.rb
33
- - CHANGELOG
29
+ - lib/rubyful_soup.rb
30
+ - tests/rubyful_soup_tests.rb
31
+ - CHANGELOG
34
32
  test_files:
35
- - tests/rubyful_soup_tests.rb
33
+ - tests/rubyful_soup_tests.rb
36
34
  rdoc_options: []
35
+
37
36
  extra_rdoc_files:
38
- - CHANGELOG
37
+ - CHANGELOG
39
38
  executables: []
39
+
40
40
  extensions: []
41
+
41
42
  requirements: []
43
+
42
44
  dependencies:
43
- - !ruby/object:Gem::Dependency
44
- name: htmltools
45
- version_requirement:
46
- version_requirements: !ruby/object:Gem::Version::Requirement
47
- requirements:
48
- -
49
- - ">"
50
- - !ruby/object:Gem::Version
51
- version: 0.0.0
52
- version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: htmltools
47
+ version_requirement:
48
+ version_requirements: !ruby/object:Gem::Version::Requirement
49
+ requirements:
50
+ - - ">"
51
+ - !ruby/object:Gem::Version
52
+ version: 0.0.0
53
+ version: