scrubber-scrubyt 0.4.20 → 0.4.25
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +6 -1
- data/{README → README.rdoc} +0 -0
- data/lib/scrubyt.rb +9 -1
- data/lib/scrubyt/core/navigation/agents/firewatir.rb +4 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +9 -0
- data/lib/scrubyt/core/shared/extractor.rb +7 -4
- data/lib/scrubyt/output/result_node.rb +5 -0
- data/lib/scrubyt/utils/shared_utils.rb +1 -1
- metadata +2 -2
data/CHANGELOG
CHANGED
@@ -1,10 +1,15 @@
|
|
1
1
|
= scRUBYt! Changelog
|
2
2
|
|
3
3
|
== 0.4.3
|
4
|
-
==
|
4
|
+
== 23th April
|
5
5
|
|
6
6
|
- [NEW] option to close the firefox window after the scraping is finished (thanks to Mikkel Garcia and Damien Garros)
|
7
|
+
- [NEW] Added the ability to scrape, click_link, then scrape again. (Only for firefox agent) (thanks to Mikkel Garcia)
|
7
8
|
- [FIX] scRUBYt! now works with latest version of mechanize (thanks to nesquena, austinmoore and Leandro Nunes)
|
9
|
+
- [NEW] added a wrapper around the firewatir requirement to make firewatir optional
|
10
|
+
- [FIX] added test to prohibit traverse_from_match from attempting to traverse nil children (thanks to Dennis Sutch)
|
11
|
+
|
12
|
+
|
8
13
|
|
9
14
|
|
10
15
|
== 0.4.05
|
data/{README → README.rdoc}
RENAMED
File without changes
|
data/lib/scrubyt.rb
CHANGED
@@ -28,7 +28,15 @@ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/compound_example.rb"
|
|
28
28
|
require "#{File.dirname(__FILE__)}/scrubyt/output/result_node.rb"
|
29
29
|
require "#{File.dirname(__FILE__)}/scrubyt/output/scrubyt_result.rb"
|
30
30
|
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/mechanize.rb"
|
31
|
-
|
31
|
+
|
32
|
+
# -- Making Firewatir optional --
|
33
|
+
if defined? Firewatir::Firefox
|
34
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
|
35
|
+
else
|
36
|
+
puts "The gem firewatir is not installed"
|
37
|
+
end
|
38
|
+
# --
|
39
|
+
|
32
40
|
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/navigation_actions.rb"
|
33
41
|
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/fetch_action.rb"
|
34
42
|
require "#{File.dirname(__FILE__)}/scrubyt/core/shared/extractor.rb"
|
@@ -103,6 +103,10 @@ module Scrubyt
|
|
103
103
|
end
|
104
104
|
sleep(wait_secs) if wait_secs > 0
|
105
105
|
@@agent.wait
|
106
|
+
|
107
|
+
# evaluate the results
|
108
|
+
extractor.evaluate_extractor
|
109
|
+
|
106
110
|
@@current_doc_url = @@agent.url
|
107
111
|
@@mechanize_doc = "<html>#{@@agent.html}</html>"
|
108
112
|
@@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
|
@@ -14,6 +14,15 @@ module Scrubyt
|
|
14
14
|
@@host_name = nil
|
15
15
|
@@history = []
|
16
16
|
@@current_form = nil
|
17
|
+
@@extractor = nil
|
18
|
+
|
19
|
+
def self.extractor=(extractor)
|
20
|
+
@@extractor = extractor
|
21
|
+
end
|
22
|
+
|
23
|
+
def self.extractor
|
24
|
+
return @@extractor
|
25
|
+
end
|
17
26
|
|
18
27
|
##
|
19
28
|
# At any given point, the current document can be queried with this method; Typically used
|
@@ -71,6 +71,7 @@ module Scrubyt
|
|
71
71
|
root_pattern
|
72
72
|
end
|
73
73
|
end
|
74
|
+
FetchAction.extractor = self
|
74
75
|
context.extractor = self
|
75
76
|
context.instance_eval(&extractor_definition)
|
76
77
|
@evaluating_extractor_definition = false
|
@@ -84,9 +85,10 @@ module Scrubyt
|
|
84
85
|
#Once all is set up, evaluate the extractor from the root pattern!
|
85
86
|
root_results = evaluate_extractor
|
86
87
|
FetchAction.close_firefox if @mode.is_a?(Hash) && @mode[:close]
|
88
|
+
|
87
89
|
|
88
90
|
@result = ScrubytResult.new('root')
|
89
|
-
@result.push(*root_results)
|
91
|
+
@result.push(*@root_results)
|
90
92
|
@result.root_patterns = @root_patterns
|
91
93
|
@result.source_file = source_file
|
92
94
|
@result.source_proc = extractor_definition
|
@@ -128,14 +130,14 @@ module Scrubyt
|
|
128
130
|
end
|
129
131
|
|
130
132
|
def evaluate_extractor
|
131
|
-
root_results = []
|
133
|
+
@root_results ||= []
|
132
134
|
current_page_count = 1
|
133
135
|
catch :quit_next_page_loop do
|
134
136
|
loop do
|
135
137
|
url = get_current_doc_url #TODO need absolute address here 2/4
|
136
138
|
@processed_pages << url
|
137
139
|
@root_patterns.each do |root_pattern|
|
138
|
-
root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
|
140
|
+
@root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
|
139
141
|
end
|
140
142
|
|
141
143
|
while @processed_pages.include? url #TODO need absolute address here 3/4
|
@@ -161,7 +163,8 @@ module Scrubyt
|
|
161
163
|
current_page_count += 1
|
162
164
|
end
|
163
165
|
end
|
164
|
-
|
166
|
+
@root_patterns = []
|
167
|
+
@root_results
|
165
168
|
end
|
166
169
|
|
167
170
|
end
|
@@ -20,6 +20,7 @@ module Scrubyt
|
|
20
20
|
end
|
21
21
|
|
22
22
|
def to_s
|
23
|
+
return "" if result.nil?
|
23
24
|
text = (@result.is_a? String) ? @result : @result.inner_html.gsub(/<.*?>/, '')
|
24
25
|
text = SharedUtils.unescape_entities(text)
|
25
26
|
text.strip!
|
@@ -29,6 +30,10 @@ module Scrubyt
|
|
29
30
|
text
|
30
31
|
end
|
31
32
|
end
|
33
|
+
|
34
|
+
def inspect
|
35
|
+
to_s
|
36
|
+
end
|
32
37
|
|
33
38
|
def to_libxml
|
34
39
|
libxml_node = XML::Node.new(name)
|
@@ -39,7 +39,7 @@ module Scrubyt
|
|
39
39
|
results << node
|
40
40
|
results.delete node.parent if node.is_a? Hpricot::Elem
|
41
41
|
end
|
42
|
-
node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) }
|
42
|
+
node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) } if ! node.children.nil?
|
43
43
|
}
|
44
44
|
traverse_for_match_inner.call(node,regexp)
|
45
45
|
results
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrubber-scrubyt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.20
|
4
|
+
version: 0.4.25
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Szinek
|
@@ -45,7 +45,7 @@ extra_rdoc_files: []
|
|
45
45
|
|
46
46
|
files:
|
47
47
|
- COPYING
|
48
|
-
- README
|
48
|
+
- README.rdoc
|
49
49
|
- CHANGELOG
|
50
50
|
- Rakefile
|
51
51
|
- lib/scrubyt/core/navigation/agents/firewatir.rb
|