scrubber-scrubyt 0.4.20 → 0.4.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,10 +1,15 @@
1
1
  = scRUBYt! Changelog
2
2
 
3
3
  == 0.4.3
4
- == 20th April
4
+ == 23rd April
5
5
 
6
6
  - [NEW] option to close the firefox window after the scraping is finished (thanks to Mikkel Garcia and Damien Garros)
7
+ - [NEW] Added the ability to scrape, click_link, then scrape again. (Only for firefox agent) (thanks to Mikkel Garcia)
7
8
  - [FIX] scRUBYt! now works with latest version of mechanize (thanks to nesquena, austinmoore and Leandro Nunes)
9
+ - [NEW] added a wrapper around the firewatir requirement to make firewatir optional
10
+ - [FIX] added test to prohibit traverse_for_match from attempting to traverse nil children (thanks to Dennis Sutch)
11
+
12
+
8
13
 
9
14
 
10
15
  == 0.4.05
File without changes
data/lib/scrubyt.rb CHANGED
@@ -28,7 +28,15 @@ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/compound_example.rb"
28
28
  require "#{File.dirname(__FILE__)}/scrubyt/output/result_node.rb"
29
29
  require "#{File.dirname(__FILE__)}/scrubyt/output/scrubyt_result.rb"
30
30
  require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/mechanize.rb"
31
- require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
31
+
32
+ # -- Making Firewatir optional --
33
+ if defined? Firewatir::Firefox
34
+ require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
35
+ else
36
+ puts "The gem firewatir is not installed"
37
+ end
38
+ # --
39
+
32
40
  require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/navigation_actions.rb"
33
41
  require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/fetch_action.rb"
34
42
  require "#{File.dirname(__FILE__)}/scrubyt/core/shared/extractor.rb"
@@ -103,6 +103,10 @@ module Scrubyt
103
103
  end
104
104
  sleep(wait_secs) if wait_secs > 0
105
105
  @@agent.wait
106
+
107
+ # evaluate the results
108
+ extractor.evaluate_extractor
109
+
106
110
  @@current_doc_url = @@agent.url
107
111
  @@mechanize_doc = "<html>#{@@agent.html}</html>"
108
112
  @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
@@ -14,6 +14,15 @@ module Scrubyt
14
14
  @@host_name = nil
15
15
  @@history = []
16
16
  @@current_form = nil
17
+ @@extractor = nil
18
+
19
+ def self.extractor=(extractor)
20
+ @@extractor = extractor
21
+ end
22
+
23
+ def self.extractor
24
+ return @@extractor
25
+ end
17
26
 
18
27
  ##
19
28
  # At any given point, the current document can be queried with this method; Typically used
@@ -71,6 +71,7 @@ module Scrubyt
71
71
  root_pattern
72
72
  end
73
73
  end
74
+ FetchAction.extractor = self
74
75
  context.extractor = self
75
76
  context.instance_eval(&extractor_definition)
76
77
  @evaluating_extractor_definition = false
@@ -84,9 +85,10 @@ module Scrubyt
84
85
  #Once all is set up, evaluate the extractor from the root pattern!
85
86
  root_results = evaluate_extractor
86
87
  FetchAction.close_firefox if @mode.is_a?(Hash) && @mode[:close]
88
+
87
89
 
88
90
  @result = ScrubytResult.new('root')
89
- @result.push(*root_results)
91
+ @result.push(*@root_results)
90
92
  @result.root_patterns = @root_patterns
91
93
  @result.source_file = source_file
92
94
  @result.source_proc = extractor_definition
@@ -128,14 +130,14 @@ module Scrubyt
128
130
  end
129
131
 
130
132
  def evaluate_extractor
131
- root_results = []
133
+ @root_results ||= []
132
134
  current_page_count = 1
133
135
  catch :quit_next_page_loop do
134
136
  loop do
135
137
  url = get_current_doc_url #TODO need absolute address here 2/4
136
138
  @processed_pages << url
137
139
  @root_patterns.each do |root_pattern|
138
- root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
140
+ @root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
139
141
  end
140
142
 
141
143
  while @processed_pages.include? url #TODO need absolute address here 3/4
@@ -161,7 +163,8 @@ module Scrubyt
161
163
  current_page_count += 1
162
164
  end
163
165
  end
164
- root_results
166
+ @root_patterns = []
167
+ @root_results
165
168
  end
166
169
 
167
170
  end
@@ -20,6 +20,7 @@ module Scrubyt
20
20
  end
21
21
 
22
22
  def to_s
23
+ return "" if result.nil?
23
24
  text = (@result.is_a? String) ? @result : @result.inner_html.gsub(/<.*?>/, '')
24
25
  text = SharedUtils.unescape_entities(text)
25
26
  text.strip!
@@ -29,6 +30,10 @@ module Scrubyt
29
30
  text
30
31
  end
31
32
  end
33
+
34
+ def inspect
35
+ to_s
36
+ end
32
37
 
33
38
  def to_libxml
34
39
  libxml_node = XML::Node.new(name)
@@ -39,7 +39,7 @@ module Scrubyt
39
39
  results << node
40
40
  results.delete node.parent if node.is_a? Hpricot::Elem
41
41
  end
42
- node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) }
42
+ node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) } if ! node.children.nil?
43
43
  }
44
44
  traverse_for_match_inner.call(node,regexp)
45
45
  results
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrubber-scrubyt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.20
4
+ version: 0.4.25
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Szinek
@@ -45,7 +45,7 @@ extra_rdoc_files: []
45
45
 
46
46
  files:
47
47
  - COPYING
48
- - README
48
+ - README.rdoc
49
49
  - CHANGELOG
50
50
  - Rakefile
51
51
  - lib/scrubyt/core/navigation/agents/firewatir.rb