scrubber-scrubyt 0.4.20 → 0.4.25

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,10 +1,15 @@
1
1
  = scRUBYt! Changelog
2
2
 
3
3
  == 0.4.3
4
- == 20th April
4
+ == 23rd April
5
5
 
6
6
  - [NEW] option to close the firefox window after the scraping is finished (thanks to Mikkel Garcia and Damien Garros)
7
+ - [NEW] Added the ability to scrape, click_link, then scrape again. (Only for firefox agent) (thanks to Mikkel Garcia)
7
8
  - [FIX] scRUBYt! now works with latest version of mechanize (thanks to nesquena, austinmoore and Leandro Nunes)
9
+ - [NEW] added a wrapper around the firewatir requirement to make firewatir optional
10
+ - [FIX] added test to prohibit traverse_for_match from attempting to traverse nil children (thanks to Dennis Sutch)
11
+
12
+
8
13
 
9
14
 
10
15
  == 0.4.05
File without changes
data/lib/scrubyt.rb CHANGED
@@ -28,7 +28,15 @@ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/compound_example.rb"
28
28
  require "#{File.dirname(__FILE__)}/scrubyt/output/result_node.rb"
29
29
  require "#{File.dirname(__FILE__)}/scrubyt/output/scrubyt_result.rb"
30
30
  require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/mechanize.rb"
31
- require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
31
+
32
+ # -- Making Firewatir optional --
33
+ if defined? Firewatir::Firefox
34
+ require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
35
+ else
36
+ puts "The gem firewatir is not installed"
37
+ end
38
+ # --
39
+
32
40
  require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/navigation_actions.rb"
33
41
  require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/fetch_action.rb"
34
42
  require "#{File.dirname(__FILE__)}/scrubyt/core/shared/extractor.rb"
@@ -103,6 +103,10 @@ module Scrubyt
103
103
  end
104
104
  sleep(wait_secs) if wait_secs > 0
105
105
  @@agent.wait
106
+
107
+ # evaluate the results
108
+ extractor.evaluate_extractor
109
+
106
110
  @@current_doc_url = @@agent.url
107
111
  @@mechanize_doc = "<html>#{@@agent.html}</html>"
108
112
  @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
@@ -14,6 +14,15 @@ module Scrubyt
14
14
  @@host_name = nil
15
15
  @@history = []
16
16
  @@current_form = nil
17
+ @@extractor = nil
18
+
19
+ def self.extractor=(extractor)
20
+ @@extractor = extractor
21
+ end
22
+
23
+ def self.extractor
24
+ return @@extractor
25
+ end
17
26
 
18
27
  ##
19
28
  # At any given point, the current document can be queried with this method; Typically used
@@ -71,6 +71,7 @@ module Scrubyt
71
71
  root_pattern
72
72
  end
73
73
  end
74
+ FetchAction.extractor = self
74
75
  context.extractor = self
75
76
  context.instance_eval(&extractor_definition)
76
77
  @evaluating_extractor_definition = false
@@ -84,9 +85,10 @@ module Scrubyt
84
85
  #Once all is set up, evaluate the extractor from the root pattern!
85
86
  root_results = evaluate_extractor
86
87
  FetchAction.close_firefox if @mode.is_a?(Hash) && @mode[:close]
88
+
87
89
 
88
90
  @result = ScrubytResult.new('root')
89
- @result.push(*root_results)
91
+ @result.push(*@root_results)
90
92
  @result.root_patterns = @root_patterns
91
93
  @result.source_file = source_file
92
94
  @result.source_proc = extractor_definition
@@ -128,14 +130,14 @@ module Scrubyt
128
130
  end
129
131
 
130
132
  def evaluate_extractor
131
- root_results = []
133
+ @root_results ||= []
132
134
  current_page_count = 1
133
135
  catch :quit_next_page_loop do
134
136
  loop do
135
137
  url = get_current_doc_url #TODO need absolute address here 2/4
136
138
  @processed_pages << url
137
139
  @root_patterns.each do |root_pattern|
138
- root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
140
+ @root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
139
141
  end
140
142
 
141
143
  while @processed_pages.include? url #TODO need absolute address here 3/4
@@ -161,7 +163,8 @@ module Scrubyt
161
163
  current_page_count += 1
162
164
  end
163
165
  end
164
- root_results
166
+ @root_patterns = []
167
+ @root_results
165
168
  end
166
169
 
167
170
  end
@@ -20,6 +20,7 @@ module Scrubyt
20
20
  end
21
21
 
22
22
  def to_s
23
+ return "" if result.nil?
23
24
  text = (@result.is_a? String) ? @result : @result.inner_html.gsub(/<.*?>/, '')
24
25
  text = SharedUtils.unescape_entities(text)
25
26
  text.strip!
@@ -29,6 +30,10 @@ module Scrubyt
29
30
  text
30
31
  end
31
32
  end
33
+
34
+ def inspect
35
+ to_s
36
+ end
32
37
 
33
38
  def to_libxml
34
39
  libxml_node = XML::Node.new(name)
@@ -39,7 +39,7 @@ module Scrubyt
39
39
  results << node
40
40
  results.delete node.parent if node.is_a? Hpricot::Elem
41
41
  end
42
- node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) }
42
+ node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) } if ! node.children.nil?
43
43
  }
44
44
  traverse_for_match_inner.call(node,regexp)
45
45
  results
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrubber-scrubyt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.20
4
+ version: 0.4.25
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Szinek
@@ -45,7 +45,7 @@ extra_rdoc_files: []
45
45
 
46
46
  files:
47
47
  - COPYING
48
- - README
48
+ - README.rdoc
49
49
  - CHANGELOG
50
50
  - Rakefile
51
51
  - lib/scrubyt/core/navigation/agents/firewatir.rb