scrubyt 0.2.3 → 0.2.6
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +30 -0
- data/Rakefile +2 -2
- data/lib/scrubyt.rb +5 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +13 -2
- data/lib/scrubyt/core/navigation/navigation_actions.rb +4 -0
- data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
- data/lib/scrubyt/core/scraping/filter.rb +35 -11
- data/lib/scrubyt/core/scraping/pattern.rb +29 -22
- data/lib/scrubyt/core/scraping/result_indexer.rb +2 -0
- data/lib/scrubyt/core/shared/evaluation_context.rb +44 -22
- data/lib/scrubyt/core/shared/extractor.rb +111 -15
- data/lib/scrubyt/core/shared/u_r_i_builder.rb +67 -0
- data/lib/scrubyt/output/export.rb +69 -22
- data/lib/scrubyt/output/result.rb +1 -0
- data/lib/scrubyt/output/result_dumper.rb +26 -7
- data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
- data/lib/scrubyt/utils/shared_utils.rb +45 -0
- data/lib/scrubyt/utils/simple_example_lookup.rb +23 -0
- data/lib/scrubyt/utils/xpathutils.rb +43 -92
- data/test/unittests/simple_example_lookup_test.rb +68 -0
- data/test/unittests/xpathutils_test.rb +0 -13
- metadata +9 -3
@@ -0,0 +1,50 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
#=<tt>Lookup of compound examples</tt>
|
3
|
+
#There are two types of string examples in scRUBYt! right now:
|
4
|
+
#the simple example and the compound example.
|
5
|
+
#
|
6
|
+
#This class is responsible for finding elements matched by compound examples.
|
7
|
+
#In the future, more sophisticated matching algorithms will probably be added
|
8
|
+
#(e.g. match the n-th which matches the text, or element that matches the
|
9
|
+
#text but also contains a specific attribute etc.)
|
10
|
+
class CompoundExampleLookup
|
11
|
+
def self.find_node_from_compund_example(doc, compound_example, next_link)
|
12
|
+
@partial_results = []
|
13
|
+
self.lookup_compound_example(doc, compound_example)
|
14
|
+
end
|
15
|
+
|
16
|
+
private
|
17
|
+
#Look up the first element which is matched by this compound example
|
18
|
+
#
|
19
|
+
#A compound example is specified with :contains, :begins_with and
|
20
|
+
#:ends_with descriptors - which can be both regexps or strings
|
21
|
+
#
|
22
|
+
#Example:
|
23
|
+
#
|
24
|
+
#flight_info :begins_with => 'Arrival', :contains => /\d\d-\d+/, :ends_with => '20:00'
|
25
|
+
def self.lookup_compound_example(doc, compound_example)
|
26
|
+
compound_example.each do |k,v|
|
27
|
+
v = Regexp.escape(v) if v.is_a? String
|
28
|
+
case k
|
29
|
+
when :contains
|
30
|
+
v = /#{v}/
|
31
|
+
when :begins_with
|
32
|
+
v = /^\s*#{v}/
|
33
|
+
when :ends_with
|
34
|
+
v = /#{v}\s*$/
|
35
|
+
end
|
36
|
+
if (@partial_results.empty?)
|
37
|
+
@partial_results = SharedUtils.traverse_for_match(doc, v)
|
38
|
+
else
|
39
|
+
refine_partial_results(v)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
@partial_results.first
|
43
|
+
end
|
44
|
+
|
45
|
+
def self.refine_partial_results(regexp)
|
46
|
+
@partial_results = @partial_results.select {|pr| pr.inner_text =~ regexp}
|
47
|
+
end
|
48
|
+
|
49
|
+
end #End of class CompoundExampleLookup
|
50
|
+
end #End of module Scrubyt
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
##
|
3
|
+
#=<tt>Utilities shared between the other utility classes (XPathUtils, SimpleExampleLookup,...)</tt>
|
4
|
+
#
|
5
|
+
class SharedUtils
|
6
|
+
#When looking up examples, do NOT recurse into these tags since they won't contain any usable info
|
7
|
+
NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']
|
8
|
+
|
9
|
+
#Entities to replace - need to make this more complete, or install htmlentities or similar package
|
10
|
+
ENTITIES = {
|
11
|
+
'quot' => '"',
|
12
|
+
'apos' => "'",
|
13
|
+
'amp' => '&',
|
14
|
+
'lt' => '<',
|
15
|
+
'gt' => '>',
|
16
|
+
'nbsp' => ' '}
|
17
|
+
|
18
|
+
#Unescape the entities in the HTML!
|
19
|
+
def self.unescape_entities(text)
|
20
|
+
ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
|
21
|
+
text
|
22
|
+
end
|
23
|
+
|
24
|
+
#Entry point for finding the elements specified by examples
|
25
|
+
def self.traverse_for_match(node, regexp)
|
26
|
+
@results = []
|
27
|
+
traverse_for_match_inner(node,regexp)
|
28
|
+
@results
|
29
|
+
end
|
30
|
+
|
31
|
+
private
|
32
|
+
def self.traverse_for_match_inner(node, regexp)
|
33
|
+
ft = unescape_entities(node.inner_text).strip
|
34
|
+
if ((ft =~ regexp) && (node.is_a? Hpricot::Elem))
|
35
|
+
@results << node
|
36
|
+
@results.delete node.parent
|
37
|
+
end
|
38
|
+
node.children.each do |child|
|
39
|
+
if child.instance_of? Hpricot::Elem
|
40
|
+
traverse_for_match_inner(child, regexp) unless NON_CONTENT_TAGS.include? child.name
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end #end of method traverse_for_match
|
44
|
+
end #end of class SharedUtils
|
45
|
+
end #end of module Scrubyt
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Scrubyt
|
2
|
+
#=<tt>Lookup of simple examples</tt>
|
3
|
+
#There are two types of string examples in scRUBYt! right now:
|
4
|
+
#the simple example and the compound example.
|
5
|
+
#
|
6
|
+
#This class is responsible for finding elements matched by simple examples.
|
7
|
+
#In the future, more sophisticated matching algorithms will probably be added
|
8
|
+
#(e.g. match the n-th which matches the text, or element that matches the
|
9
|
+
#text but also contains a specific attribute etc.)
|
10
|
+
class SimpleExampleLookup
|
11
|
+
#From the example text defined by the user, find the lowest possible node which contains the text 'text'.
|
12
|
+
#The text can be also a mixed content text, e.g.
|
13
|
+
#
|
14
|
+
# <a>Bon <b>nuit</b>, monsieur!</a>
|
15
|
+
#
|
16
|
+
#In this case, <a>'s text is considered to be "Bon nuit, monsieur"
|
17
|
+
def self.find_node_from_text(doc, text, next_link)
|
18
|
+
text.gsub!('»', '»')
|
19
|
+
text = Regexp.escape(text) if text.is_a? String
|
20
|
+
SharedUtils.traverse_for_match(doc,/#{text}/).first
|
21
|
+
end
|
22
|
+
end #End of class SimpleExampleLookup
|
23
|
+
end #End of module Scrubyt
|
@@ -4,54 +4,8 @@ require 'hpricot'
|
|
4
4
|
module Scrubyt
|
5
5
|
##
|
6
6
|
#=<tt>Various XPath utility functions</tt>
|
7
|
-
class XPathUtils
|
8
|
-
|
9
|
-
NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']
|
10
|
-
ENTITIES = {
|
11
|
-
'quot' => '"',
|
12
|
-
'apos' => "'",
|
13
|
-
'amp' => '&',
|
14
|
-
'lt' => '<',
|
15
|
-
'gt' => '>',
|
16
|
-
'nbsp' => ' '}
|
17
|
-
|
18
|
-
#From the example text defined by the user, find the lowest possible node with the text 'text'.
|
19
|
-
#The text can be also a mixed content text, e.g.
|
20
|
-
#
|
21
|
-
# <a>Bon <b>nuit</b>, monsieur!</a>
|
22
|
-
#
|
23
|
-
#In this case, <a>'s text is considered to be "Bon nuit, monsieur"
|
24
|
-
def self.find_node_from_text(doc, text, next_link)
|
25
|
-
@node = nil
|
26
|
-
@found = false
|
27
|
-
#digg next page hack
|
28
|
-
text.gsub!('»', '»')
|
29
|
-
self.traverse_for_full_text(doc,text)
|
30
|
-
self.lowest_possible_node_with_text(@node, text) if @node != nil
|
31
|
-
if (@found == false)
|
32
|
-
#Fallback to per node text lookup
|
33
|
-
self.traverse_for_node_text(doc,text)
|
34
|
-
if (@found == false)
|
35
|
-
return nil if next_link
|
36
|
-
puts "!" * 65
|
37
|
-
puts "!!!!!! FATAL: Node for example #{text} Not found! !!!!!!"
|
38
|
-
puts "!!!!!! Please make sure you specified the example properly !!!!!!"
|
39
|
-
puts "!" * 65
|
40
|
-
exit
|
41
|
-
end
|
42
|
-
end
|
43
|
-
@node
|
44
|
-
end
|
45
|
-
|
46
|
-
#Full text of the node; this is equivalent to Hpricot's inner_text
|
47
|
-
#(? be sure to check). Will be
|
48
|
-
#replaced if Hpricot 0.5 will be released
|
49
|
-
def self.full_text(node)
|
50
|
-
result = ""
|
51
|
-
node.traverse_text { |t| result += t.to_s }
|
52
|
-
result
|
53
|
-
end
|
54
|
-
|
7
|
+
class XPathUtils
|
8
|
+
|
55
9
|
#Find the LCA (Lowest Common Ancestor) of two nodes
|
56
10
|
def self.lowest_common_ancestor(node1, node2)
|
57
11
|
path1 = traverse_up(node1)
|
@@ -71,7 +25,7 @@ module Scrubyt
|
|
71
25
|
#
|
72
26
|
#*parameters*
|
73
27
|
#
|
74
|
-
#_node_ - The node we are looking the XPath for
|
28
|
+
#_node_ - The node we are looking up the XPath for
|
75
29
|
#
|
76
30
|
#_stopnode_ - The Xpath generation is stopped and the XPath that
|
77
31
|
#was generated so far is returned if this node is reached.
|
@@ -154,7 +108,33 @@ module Scrubyt
|
|
154
108
|
node
|
155
109
|
end
|
156
110
|
|
111
|
+
##
|
112
|
+
#Used when automatically looking up href attributes (for detail or next links)
|
113
|
+
#If the detail pattern did not extract a link, we first look up its
|
114
|
+
#children - and if we don't find a link, traverse up
|
115
|
+
def self.find_nearest_node_with_attribute(node, attribute)
|
116
|
+
@node = nil
|
117
|
+
return node if node.is_a? Hpricot::Elem and node[attribute]
|
118
|
+
first_child_node_with_attribute(node, attribute)
|
119
|
+
first_parent_node_with_attribute(node, attribute) if !@node
|
120
|
+
@node
|
121
|
+
end
|
157
122
|
|
123
|
+
##
|
124
|
+
#Generate a relative XPath from two XPaths: a parent one (which points higher in the tree),
|
125
|
+
#and a child one. The result of the method is the relative XPath of the node pointed to
|
126
|
+
#by the second XPath to the node pointed to by the first XPath.
|
127
|
+
def self.generate_relative_XPath_from_XPaths(parent_xpath, child_xpath)
|
128
|
+
original_child_xpath_parts = child_xpath.split('/').reject{|s|s==""}
|
129
|
+
pairs = to_general_XPath(child_xpath).split('/').reject{|s|s==""}.zip to_general_XPath(parent_xpath).split('/').reject{|s|s==""}
|
130
|
+
i = 0
|
131
|
+
pairs.each_with_index do |pair,index|
|
132
|
+
i = index
|
133
|
+
break if pair[0] != pair[1]
|
134
|
+
end
|
135
|
+
"/" + original_child_xpath_parts[i..-1].join('/')
|
136
|
+
end
|
137
|
+
|
158
138
|
private
|
159
139
|
#Find the index of the child inside the parent
|
160
140
|
#For example:
|
@@ -189,50 +169,21 @@ private
|
|
189
169
|
end
|
190
170
|
path
|
191
171
|
end
|
192
|
-
|
193
|
-
def self.
|
194
|
-
return if @
|
195
|
-
if
|
196
|
-
|
197
|
-
if (t.to_s == text)
|
198
|
-
@found = true
|
199
|
-
@node = t.parent
|
200
|
-
end
|
201
|
-
end
|
202
|
-
end
|
203
|
-
node.children.each do |child|
|
204
|
-
if child.instance_of? Hpricot::Elem
|
205
|
-
traverse_for_node_text(child, text) unless NON_CONTENT_TAGS.include? child.name
|
206
|
-
end
|
207
|
-
end
|
208
|
-
end
|
209
|
-
|
210
|
-
def self.traverse_for_full_text(node, text)
|
211
|
-
return if @found
|
212
|
-
if (node.instance_of? Hpricot::Elem)
|
213
|
-
ft = unescape_entities(full_text(node)).strip
|
214
|
-
#puts ">>#{ft}<<, text:>>#{text}<<" if ((ft.size < 20) && ft =~ /Open Source/)
|
215
|
-
if (ft == text)
|
216
|
-
@found = true
|
217
|
-
@node = node
|
218
|
-
end
|
219
|
-
end
|
220
|
-
node.children.each do |child|
|
221
|
-
if child.instance_of? Hpricot::Elem
|
222
|
-
traverse_for_full_text(child, text) unless NON_CONTENT_TAGS.include? child.name
|
223
|
-
end
|
224
|
-
end
|
172
|
+
|
173
|
+
def self.first_child_node_with_attribute(node, attribute)
|
174
|
+
return if !node.instance_of? Hpricot::Elem || @node
|
175
|
+
@node = node if node.attributes[attribute]
|
176
|
+
node.children.each { |child| first_child_node_with_attribute(child, attribute) }
|
225
177
|
end
|
226
178
|
|
227
|
-
def self.
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
end #End of method lowest_possible_node_with_text
|
179
|
+
def self.first_parent_node_with_attribute(node, attribute)
|
180
|
+
return if !node.instance_of? Hpricot::Elem || @node
|
181
|
+
@node = node if node.attributes[attribute]
|
182
|
+
first_parent_node_with_attribute(node.parent, attribute)
|
183
|
+
end
|
184
|
+
|
185
|
+
def self.to_general_XPath(xpath)
|
186
|
+
xpath.gsub(/\[.+?\]/) {""}
|
187
|
+
end #End of method to_general_XPath
|
237
188
|
end #End of class XPathUtils
|
238
189
|
end #End of module Scrubyt
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'scrubyt'
|
2
|
+
require 'test/unit'
|
3
|
+
|
4
|
+
class SimpleExampleLookupTest
|
5
|
+
|
6
|
+
def setup
|
7
|
+
doc1 = <<-DOC
|
8
|
+
<a>
|
9
|
+
<b>
|
10
|
+
<c/>
|
11
|
+
<d>dddd</d>
|
12
|
+
<e>
|
13
|
+
<f>fff</f>
|
14
|
+
<k>kk</k>
|
15
|
+
<j/>
|
16
|
+
<l>lll</l>
|
17
|
+
<m/>
|
18
|
+
<n>nnn</n>
|
19
|
+
<n>nnnnnn</n>
|
20
|
+
<n>
|
21
|
+
nnnnnnnnn
|
22
|
+
<q/>
|
23
|
+
<r>rrr</r>
|
24
|
+
</n>
|
25
|
+
<o>ooo</o>
|
26
|
+
<n>nnnnnnnnnnnn</n>
|
27
|
+
<p>ppp</p>
|
28
|
+
</e>
|
29
|
+
</b>
|
30
|
+
<g>ggg</g>
|
31
|
+
</a>
|
32
|
+
DOC
|
33
|
+
@doc1 = Hpricot(doc1)
|
34
|
+
@a = @doc1.children[1]
|
35
|
+
@b = @a.children[1]
|
36
|
+
@c = @b.children[1]
|
37
|
+
@d = @b.children[3]
|
38
|
+
@e = @b.children[5]
|
39
|
+
@f = @e.children[1]
|
40
|
+
@g = @a.children[@a.children.size-2]
|
41
|
+
@k = @e.children[3]
|
42
|
+
@j = @e.children[5]
|
43
|
+
@l = @e.children[7]
|
44
|
+
@m = @e.children[9]
|
45
|
+
@n_1 = @e.children[11]
|
46
|
+
@n_2 = @e.children[13]
|
47
|
+
@n_3 = @e.children[15]
|
48
|
+
@o = @e.children[17]
|
49
|
+
@n_4 = @e.children[19]
|
50
|
+
@p = @e.children[21]
|
51
|
+
@q = @n_3.children[1]
|
52
|
+
@r = @n_3.children[3]
|
53
|
+
#@doc2 = Hpricot(open(File.join(File.dirname(__FILE__), "test.html")))
|
54
|
+
end
|
55
|
+
|
56
|
+
def test_find_node_from_text
|
57
|
+
elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"fff", false)
|
58
|
+
assert_instance_of(Hpricot::Elem, elem)
|
59
|
+
assert_equal(elem, @f)
|
60
|
+
|
61
|
+
elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"dddd", false)
|
62
|
+
assert_equal(elem, @d)
|
63
|
+
|
64
|
+
elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"rrr", false)
|
65
|
+
assert_equal(elem, @r)
|
66
|
+
|
67
|
+
end
|
68
|
+
end
|
@@ -53,20 +53,7 @@ class XPathUtilsTest < Test::Unit::TestCase
|
|
53
53
|
@r = @n_3.children[3]
|
54
54
|
#@doc2 = Hpricot(open(File.join(File.dirname(__FILE__), "test.html")))
|
55
55
|
end
|
56
|
-
|
57
|
-
def test_find_node_from_text
|
58
|
-
elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"fff", false)
|
59
|
-
assert_instance_of(Hpricot::Elem, elem)
|
60
|
-
assert_equal(elem, @f)
|
61
|
-
|
62
|
-
elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"dddd", false)
|
63
|
-
assert_equal(elem, @d)
|
64
|
-
|
65
|
-
elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"rrr", false)
|
66
|
-
assert_equal(elem, @r)
|
67
56
|
|
68
|
-
end
|
69
|
-
|
70
57
|
def test_lowest_common_ancestor
|
71
58
|
lca_b_g = Scrubyt::XPathUtils.lowest_common_ancestor(@b,@g)
|
72
59
|
lca_f_d = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@d)
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: scrubyt
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.2.3
|
7
|
-
date: 2007-
|
6
|
+
version: 0.2.6
|
7
|
+
date: 2007-03-25 00:00:00 +01:00
|
8
8
|
summary: A powerful Web-scraping framework
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -34,19 +34,24 @@ files:
|
|
34
34
|
- CHANGELOG
|
35
35
|
- Rakefile
|
36
36
|
- lib/scrubyt.rb
|
37
|
+
- lib/scrubyt/utils/shared_utils.rb
|
37
38
|
- lib/scrubyt/utils/xpathutils.rb
|
39
|
+
- lib/scrubyt/utils/simple_example_lookup.rb
|
40
|
+
- lib/scrubyt/utils/compound_example_lookup.rb
|
38
41
|
- lib/scrubyt/output/result_dumper.rb
|
39
42
|
- lib/scrubyt/output/export.rb
|
40
43
|
- lib/scrubyt/output/post_processor.rb
|
41
44
|
- lib/scrubyt/output/result.rb
|
42
|
-
- lib/scrubyt/core/navigation/fetch_action.rb
|
43
45
|
- lib/scrubyt/core/navigation/navigation_actions.rb
|
46
|
+
- lib/scrubyt/core/navigation/fetch_action.rb
|
44
47
|
- lib/scrubyt/core/scraping/result_indexer.rb
|
45
48
|
- lib/scrubyt/core/scraping/constraint_adder.rb
|
46
49
|
- lib/scrubyt/core/scraping/constraint.rb
|
47
50
|
- lib/scrubyt/core/scraping/filter.rb
|
48
51
|
- lib/scrubyt/core/scraping/pattern.rb
|
49
52
|
- lib/scrubyt/core/scraping/pre_filter_document.rb
|
53
|
+
- lib/scrubyt/core/scraping/compound_example.rb
|
54
|
+
- lib/scrubyt/core/shared/u_r_i_builder.rb
|
50
55
|
- lib/scrubyt/core/shared/evaluation_context.rb
|
51
56
|
- lib/scrubyt/core/shared/extractor.rb
|
52
57
|
test_files:
|
@@ -56,6 +61,7 @@ test_files:
|
|
56
61
|
- test/unittests/extractor_test.rb
|
57
62
|
- test/unittests/xpathutils_test.rb
|
58
63
|
- test/unittests/constraint_test.rb
|
64
|
+
- test/unittests/simple_example_lookup_test.rb
|
59
65
|
- test/unittests/input/constraint_test.html
|
60
66
|
- test/unittests/input/test.html
|
61
67
|
rdoc_options: []
|