scrubber-scrubyt 0.4.20 → 0.4.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +6 -1
- data/{README → README.rdoc} +0 -0
- data/lib/scrubyt.rb +9 -1
- data/lib/scrubyt/core/navigation/agents/firewatir.rb +4 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +9 -0
- data/lib/scrubyt/core/shared/extractor.rb +7 -4
- data/lib/scrubyt/output/result_node.rb +5 -0
- data/lib/scrubyt/utils/shared_utils.rb +1 -1
- metadata +2 -2
data/CHANGELOG
CHANGED
|
@@ -1,10 +1,15 @@
|
|
|
1
1
|
= scRUBYt! Changelog
|
|
2
2
|
|
|
3
3
|
== 0.4.3
|
|
4
|
-
==
|
|
4
|
+
== 23rd April
|
|
5
5
|
|
|
6
6
|
- [NEW] option to close the firefox window after the scraping is finished (thanks to Mikkel Garcia and Damien Garros)
|
|
7
|
+
- [NEW] Added the ability to scrape, click_link, then scrape again. (Only for firefox agent) (thanks to Mikkel Garcia)
|
|
7
8
|
- [FIX] scRUBYt! now works with latest version of mechanize (thanks to nesquena, austinmoore and Leandro Nunes)
|
|
9
|
+
- [NEW] added a wrapper around the firewatir requirement to make firewatir optional
|
|
10
|
+
- [FIX] added a check to prevent traverse_for_match from attempting to traverse nil children (thanks to Dennis Sutch)
|
|
11
|
+
|
|
12
|
+
|
|
8
13
|
|
|
9
14
|
|
|
10
15
|
== 0.4.05
|
data/{README → README.rdoc}
RENAMED
|
File without changes
|
data/lib/scrubyt.rb
CHANGED
|
@@ -28,7 +28,15 @@ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/compound_example.rb"
|
|
|
28
28
|
require "#{File.dirname(__FILE__)}/scrubyt/output/result_node.rb"
|
|
29
29
|
require "#{File.dirname(__FILE__)}/scrubyt/output/scrubyt_result.rb"
|
|
30
30
|
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/mechanize.rb"
|
|
31
|
-
|
|
31
|
+
|
|
32
|
+
# -- Making Firewatir optional --
|
|
33
|
+
if defined? Firewatir::Firefox
|
|
34
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
|
|
35
|
+
else
|
|
36
|
+
puts "The gem firewatir is not installed"
|
|
37
|
+
end
|
|
38
|
+
# --
|
|
39
|
+
|
|
32
40
|
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/navigation_actions.rb"
|
|
33
41
|
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/fetch_action.rb"
|
|
34
42
|
require "#{File.dirname(__FILE__)}/scrubyt/core/shared/extractor.rb"
|
|
@@ -103,6 +103,10 @@ module Scrubyt
|
|
|
103
103
|
end
|
|
104
104
|
sleep(wait_secs) if wait_secs > 0
|
|
105
105
|
@@agent.wait
|
|
106
|
+
|
|
107
|
+
# evaluate the results
|
|
108
|
+
extractor.evaluate_extractor
|
|
109
|
+
|
|
106
110
|
@@current_doc_url = @@agent.url
|
|
107
111
|
@@mechanize_doc = "<html>#{@@agent.html}</html>"
|
|
108
112
|
@@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
|
|
@@ -14,6 +14,15 @@ module Scrubyt
|
|
|
14
14
|
@@host_name = nil
|
|
15
15
|
@@history = []
|
|
16
16
|
@@current_form = nil
|
|
17
|
+
@@extractor = nil
|
|
18
|
+
|
|
19
|
+
def self.extractor=(extractor)
|
|
20
|
+
@@extractor = extractor
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def self.extractor
|
|
24
|
+
return @@extractor
|
|
25
|
+
end
|
|
17
26
|
|
|
18
27
|
##
|
|
19
28
|
# At any given point, the current document can be queried with this method; Typically used
|
|
@@ -71,6 +71,7 @@ module Scrubyt
|
|
|
71
71
|
root_pattern
|
|
72
72
|
end
|
|
73
73
|
end
|
|
74
|
+
FetchAction.extractor = self
|
|
74
75
|
context.extractor = self
|
|
75
76
|
context.instance_eval(&extractor_definition)
|
|
76
77
|
@evaluating_extractor_definition = false
|
|
@@ -84,9 +85,10 @@ module Scrubyt
|
|
|
84
85
|
#Once all is set up, evaluate the extractor from the root pattern!
|
|
85
86
|
root_results = evaluate_extractor
|
|
86
87
|
FetchAction.close_firefox if @mode.is_a?(Hash) && @mode[:close]
|
|
88
|
+
|
|
87
89
|
|
|
88
90
|
@result = ScrubytResult.new('root')
|
|
89
|
-
@result.push(
|
|
91
|
+
@result.push(*@root_results)
|
|
90
92
|
@result.root_patterns = @root_patterns
|
|
91
93
|
@result.source_file = source_file
|
|
92
94
|
@result.source_proc = extractor_definition
|
|
@@ -128,14 +130,14 @@ module Scrubyt
|
|
|
128
130
|
end
|
|
129
131
|
|
|
130
132
|
def evaluate_extractor
|
|
131
|
-
root_results
|
|
133
|
+
@root_results ||= []
|
|
132
134
|
current_page_count = 1
|
|
133
135
|
catch :quit_next_page_loop do
|
|
134
136
|
loop do
|
|
135
137
|
url = get_current_doc_url #TODO need absolute address here 2/4
|
|
136
138
|
@processed_pages << url
|
|
137
139
|
@root_patterns.each do |root_pattern|
|
|
138
|
-
root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
|
|
140
|
+
@root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
|
|
139
141
|
end
|
|
140
142
|
|
|
141
143
|
while @processed_pages.include? url #TODO need absolute address here 3/4
|
|
@@ -161,7 +163,8 @@ module Scrubyt
|
|
|
161
163
|
current_page_count += 1
|
|
162
164
|
end
|
|
163
165
|
end
|
|
164
|
-
|
|
166
|
+
@root_patterns = []
|
|
167
|
+
@root_results
|
|
165
168
|
end
|
|
166
169
|
|
|
167
170
|
end
|
|
@@ -20,6 +20,7 @@ module Scrubyt
|
|
|
20
20
|
end
|
|
21
21
|
|
|
22
22
|
def to_s
|
|
23
|
+
return "" if result.nil?
|
|
23
24
|
text = (@result.is_a? String) ? @result : @result.inner_html.gsub(/<.*?>/, '')
|
|
24
25
|
text = SharedUtils.unescape_entities(text)
|
|
25
26
|
text.strip!
|
|
@@ -29,6 +30,10 @@ module Scrubyt
|
|
|
29
30
|
text
|
|
30
31
|
end
|
|
31
32
|
end
|
|
33
|
+
|
|
34
|
+
def inspect
|
|
35
|
+
to_s
|
|
36
|
+
end
|
|
32
37
|
|
|
33
38
|
def to_libxml
|
|
34
39
|
libxml_node = XML::Node.new(name)
|
|
@@ -39,7 +39,7 @@ module Scrubyt
|
|
|
39
39
|
results << node
|
|
40
40
|
results.delete node.parent if node.is_a? Hpricot::Elem
|
|
41
41
|
end
|
|
42
|
-
node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) }
|
|
42
|
+
node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) } if ! node.children.nil?
|
|
43
43
|
}
|
|
44
44
|
traverse_for_match_inner.call(node,regexp)
|
|
45
45
|
results
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: scrubber-scrubyt
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.4.
|
|
4
|
+
version: 0.4.25
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Peter Szinek
|
|
@@ -45,7 +45,7 @@ extra_rdoc_files: []
|
|
|
45
45
|
|
|
46
46
|
files:
|
|
47
47
|
- COPYING
|
|
48
|
-
- README
|
|
48
|
+
- README.rdoc
|
|
49
49
|
- CHANGELOG
|
|
50
50
|
- Rakefile
|
|
51
51
|
- lib/scrubyt/core/navigation/agents/firewatir.rb
|