scrubyt 0.2.3 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +30 -0
- data/Rakefile +2 -2
- data/lib/scrubyt.rb +5 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +13 -2
- data/lib/scrubyt/core/navigation/navigation_actions.rb +4 -0
- data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
- data/lib/scrubyt/core/scraping/filter.rb +35 -11
- data/lib/scrubyt/core/scraping/pattern.rb +29 -22
- data/lib/scrubyt/core/scraping/result_indexer.rb +2 -0
- data/lib/scrubyt/core/shared/evaluation_context.rb +44 -22
- data/lib/scrubyt/core/shared/extractor.rb +111 -15
- data/lib/scrubyt/core/shared/u_r_i_builder.rb +67 -0
- data/lib/scrubyt/output/export.rb +69 -22
- data/lib/scrubyt/output/result.rb +1 -0
- data/lib/scrubyt/output/result_dumper.rb +26 -7
- data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
- data/lib/scrubyt/utils/shared_utils.rb +45 -0
- data/lib/scrubyt/utils/simple_example_lookup.rb +23 -0
- data/lib/scrubyt/utils/xpathutils.rb +43 -92
- data/test/unittests/simple_example_lookup_test.rb +68 -0
- data/test/unittests/xpathutils_test.rb +0 -13
- metadata +9 -3
@@ -0,0 +1,50 @@
|
|
1
|
+
module Scrubyt
  #=<tt>Lookup of compound examples</tt>
  #There are two types of string examples in scRUBYt! right now:
  #the simple example and the compound example.
  #
  #This class is responsible for finding elements matched by compound examples.
  #In the future probably more sophisticated matching algorithms will be added
  #(e.g. match the n-th which matches the text, or element that matches the
  #text but also contains a specific attribute etc.)
  class CompoundExampleLookup
    #Entry point: return the first element of _doc_ matched by every descriptor
    #of _compound_example_, or nil if there is no such element.
    #
    #*parameters*
    #
    #_doc_ - the node the search starts from
    #
    #_compound_example_ - hash of descriptors (see lookup_compound_example)
    #
    #_next_link_ - accepted for interface compatibility; not read by this method
    #
    #(The misspelling in the method name is part of the public interface.)
    def self.find_node_from_compund_example(doc, compound_example, next_link)
      @partial_results = []
      self.lookup_compound_example(doc, compound_example)
    end

    #The methods below are internal helpers. They are deliberately left public:
    #a bare 'private' has no effect on methods defined with 'def self.', so they
    #have always been callable from outside.

    #Lookup the first element which is matched by this compound example
    #
    #A compound example is specified with :contains, :begins_with and
    #:ends_with descriptors - which can be both regexps or strings
    #
    #Example:
    #
    #flight_info :begins_with => 'Arrival', :contains => /\d\d-\d+/, :ends_with => '20:00'
    def self.lookup_compound_example(doc, compound_example)
      first_descriptor = true
      compound_example.each do |k,v|
        #Strings are matched literally; regexps are embedded as they are
        v = Regexp.escape(v) if v.is_a? String
        case k
        when :contains
          v = /#{v}/
        when :begins_with
          v = /^\s*#{v}/
        when :ends_with
          v = /#{v}\s*$/
        end
        if first_descriptor
          #Only the first descriptor searches the document. Testing for an
          #empty result list instead would restart the search whenever an
          #earlier descriptor matched nothing, and so return an element which
          #does not satisfy all the descriptors.
          @partial_results = SharedUtils.traverse_for_match(doc, v)
          first_descriptor = false
        else
          refine_partial_results(v)
        end
      end
      @partial_results.first
    end

    #Keep only those partial results whose text is matched by _regexp_ as well.
    #The text is normalized (entities unescaped, surrounding whitespace
    #stripped) the same way SharedUtils.traverse_for_match_inner does it, so
    #all the descriptors are matched against the same text.
    def self.refine_partial_results(regexp)
      @partial_results = @partial_results.select do |pr|
        #dup: unescape_entities changes its argument in place
        SharedUtils.unescape_entities(pr.inner_text.dup).strip =~ regexp
      end
    end

  end #End of class CompoundExampleLookup
end #End of module Scrubyt
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Scrubyt
  ##
  #=<tt>Utilities shared between the other utility classes (XPathUtils, SimpleExampleLookup,...)</tt>
  #
  class SharedUtils
    #When looking up examples, do NOT recurse into these tags since they won't contain any usable info
    NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']

    #Entities to replace - need to make this more complete, or install htmlentities or similar package
    ENTITIES = {
      'quot' => '"',
      'apos' => "'",
      'amp' => '&',
      'lt' => '<',
      'gt' => '>',
      'nbsp' => ' '}

    #Matches any of the entities listed in ENTITIES; the name is captured in group 1
    ENTITY_REGEXP = /&(#{ENTITIES.keys.join('|')});/

    #Unescape the entities in the HTML!
    #
    #_text_ is changed in place and also returned.
    #
    #All the entities are replaced in a single pass. Replacing them one after
    #another (with 'amp' before 'lt'/'gt') decodes text like "&amp;lt;" twice
    #and yields "<" instead of "&lt;".
    def self.unescape_entities(text)
      text.gsub!(ENTITY_REGEXP) { ENTITIES[$1] }
      text
    end

    #Entry point for finding the elements specified by examples
    #
    #Returns the array of elements under (and including) _node_ whose
    #unescaped, stripped inner text is matched by _regexp_; a matching
    #element replaces its matching parent in the result.
    def self.traverse_for_match(node, regexp)
      @results = []
      traverse_for_match_inner(node,regexp)
      @results
    end

    #Recursive worker of traverse_for_match; collects the hits in @results.
    #
    #Left public on purpose: a bare 'private' has no effect on methods
    #defined with 'def self.', so this method has always been callable.
    def self.traverse_for_match_inner(node, regexp)
      ft = unescape_entities(node.inner_text).strip
      if ((ft =~ regexp) && (node.is_a? Hpricot::Elem))
        @results << node
        #The parent matched only because it contains this node - keep the lower one
        @results.delete node.parent
      end
      node.children.each do |child|
        if child.instance_of? Hpricot::Elem
          traverse_for_match_inner(child, regexp) unless NON_CONTENT_TAGS.include? child.name
        end
      end
    end #end of method traverse_for_match_inner
  end #end of class SharedUtils
end #end of module Scrubyt
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Scrubyt
  #=<tt>Lookup of simple examples</tt>
  #There are two types of string examples in scRUBYt! right now:
  #the simple example and the compound example.
  #
  #This class is responsible for finding elements matched by simple examples.
  #In the future probably more sophisticated matching algorithms will be added
  #(e.g. match the n-th which matches the text, or element that matches the
  #text but also contains a specific attribute etc.)
  class SimpleExampleLookup
    #From the example text defined by the user, find the lowest possible node which contains the text 'text'.
    #The text can be also a mixed content text, e.g.
    #
    #  <a>Bon <b>nuit</b>, monsieur!</a>
    #
    #In this case, <a>'s text is considered to be "Bon nuit, monsieur"
    #
    #*parameters*
    #
    #_doc_ - the node the search starts from
    #
    #_text_ - the example; a String is matched literally, anything else
    #(e.g. a Regexp) is interpolated into the search pattern as it is
    #
    #_next_link_ - accepted for interface compatibility; not read by this method
    #
    #Returns the first element found by SharedUtils.traverse_for_match, or nil.
    def self.find_node_from_text(doc, text, next_link)
      if text.is_a? String
        #Only strings are substituted and escaped: calling gsub! before the
        #type check raised NoMethodError for Regexp examples, and changed (or,
        #for frozen strings, failed on) the caller's string.
        #NOTE(review): pattern and replacement are identical as shown here;
        #the replacement was presumably an entity before this text was
        #rendered - confirm against the released gem.
        text = Regexp.escape(text.gsub('»', '»'))
      end
      SharedUtils.traverse_for_match(doc,/#{text}/).first
    end
  end #End of class SimpleExampleLookup
end #End of module Scrubyt
|
@@ -4,54 +4,8 @@ require 'hpricot'
|
|
4
4
|
module Scrubyt
|
5
5
|
##
|
6
6
|
#=<tt>Various XPath utility functions</tt>
|
7
|
-
class XPathUtils
|
8
|
-
|
9
|
-
NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']
|
10
|
-
ENTITIES = {
|
11
|
-
'quot' => '"',
|
12
|
-
'apos' => "'",
|
13
|
-
'amp' => '&',
|
14
|
-
'lt' => '<',
|
15
|
-
'gt' => '>',
|
16
|
-
'nbsp' => ' '}
|
17
|
-
|
18
|
-
#From the example text defined by the user, find the lowest possible node with the text 'text'.
|
19
|
-
#The text can be also a mixed content text, e.g.
|
20
|
-
#
|
21
|
-
# <a>Bon <b>nuit</b>, monsieur!</a>
|
22
|
-
#
|
23
|
-
#In this case, <a>'s text is considered to be "Bon nuit, monsieur"
|
24
|
-
def self.find_node_from_text(doc, text, next_link)
|
25
|
-
@node = nil
|
26
|
-
@found = false
|
27
|
-
#digg next page hack
|
28
|
-
text.gsub!('»', '»')
|
29
|
-
self.traverse_for_full_text(doc,text)
|
30
|
-
self.lowest_possible_node_with_text(@node, text) if @node != nil
|
31
|
-
if (@found == false)
|
32
|
-
#Fallback to per node text lookup
|
33
|
-
self.traverse_for_node_text(doc,text)
|
34
|
-
if (@found == false)
|
35
|
-
return nil if next_link
|
36
|
-
puts "!" * 65
|
37
|
-
puts "!!!!!! FATAL: Node for example #{text} Not found! !!!!!!"
|
38
|
-
puts "!!!!!! Please make sure you specified the example properly !!!!!!"
|
39
|
-
puts "!" * 65
|
40
|
-
exit
|
41
|
-
end
|
42
|
-
end
|
43
|
-
@node
|
44
|
-
end
|
45
|
-
|
46
|
-
#Full text of the node; this is equivalent to Hpricot's inner_text
|
47
|
-
#(? be sure to check). Will be
|
48
|
-
#replaced if Hpricot 0.5 will be released
|
49
|
-
def self.full_text(node)
|
50
|
-
result = ""
|
51
|
-
node.traverse_text { |t| result += t.to_s }
|
52
|
-
result
|
53
|
-
end
|
54
|
-
|
7
|
+
class XPathUtils
|
8
|
+
|
55
9
|
#Find the LCA (Lowest Common Ancestor) of two nodes
|
56
10
|
def self.lowest_common_ancestor(node1, node2)
|
57
11
|
path1 = traverse_up(node1)
|
@@ -71,7 +25,7 @@ module Scrubyt
|
|
71
25
|
#
|
72
26
|
#*parameters*
|
73
27
|
#
|
74
|
-
#_node_ - The node we are looking the XPath for
|
28
|
+
#_node_ - The node we are looking up the XPath for
|
75
29
|
#
|
76
30
|
#_stopnode_ - The Xpath generation is stopped and the XPath that
|
77
31
|
#was generated so far is returned if this node is reached.
|
@@ -154,7 +108,33 @@ module Scrubyt
|
|
154
108
|
node
|
155
109
|
end
|
156
110
|
|
111
|
+
##
#Used when automatically looking up href attributes (for detail or next links)
#If the detail pattern did not extract a link, we first look up its
#children - and if we don't find a link, traverse up
#
#Returns _node_ itself if it is an element carrying _attribute_, otherwise
#whatever the child lookup, and then the parent lookup, stored in @node
#(nil if neither found anything).
def self.find_nearest_node_with_attribute(node, attribute)
  @node = nil
  if node.is_a?(Hpricot::Elem) && node[attribute]
    return node
  end
  first_child_node_with_attribute(node, attribute)
  #Walk upwards only if nothing was found below
  first_parent_node_with_attribute(node, attribute) unless @node
  @node
end
|
157
122
|
|
123
|
+
##
#Generate relative XPath from two XPaths: a parent one, (which points higher in the tree),
#and a child one. The result of the method is the relative XPath of the node pointed to
#by the second XPath to the node pointed to by the first XPath.
#
#The two XPaths are compared step by step with their [...] predicates removed;
#the result is built from the original (predicate-carrying) steps of the child XPath.
def self.generate_relative_XPath_from_XPaths(parent_xpath, child_xpath)
  child_steps = child_xpath.split('/').reject { |step| step == "" }
  general_child = to_general_XPath(child_xpath).split('/').reject { |step| step == "" }
  general_parent = to_general_XPath(parent_xpath).split('/').reject { |step| step == "" }
  #Index of the first step where the two paths differ...
  start = (0...general_child.size).find { |idx| general_child[idx] != general_parent[idx] }
  #...or the last step if they never differ (0 for an empty path)
  start ||= [general_child.size - 1, 0].max
  "/" + child_steps[start..-1].join('/')
end
|
137
|
+
|
158
138
|
private
|
159
139
|
#Find the index of the child inside the parent
|
160
140
|
#For example:
|
@@ -189,50 +169,21 @@ private
|
|
189
169
|
end
|
190
170
|
path
|
191
171
|
end
|
192
|
-
|
193
|
-
def self.
|
194
|
-
return if @
|
195
|
-
if
|
196
|
-
|
197
|
-
if (t.to_s == text)
|
198
|
-
@found = true
|
199
|
-
@node = t.parent
|
200
|
-
end
|
201
|
-
end
|
202
|
-
end
|
203
|
-
node.children.each do |child|
|
204
|
-
if child.instance_of? Hpricot::Elem
|
205
|
-
traverse_for_node_text(child, text) unless NON_CONTENT_TAGS.include? child.name
|
206
|
-
end
|
207
|
-
end
|
208
|
-
end
|
209
|
-
|
210
|
-
def self.traverse_for_full_text(node, text)
|
211
|
-
return if @found
|
212
|
-
if (node.instance_of? Hpricot::Elem)
|
213
|
-
ft = unescape_entities(full_text(node)).strip
|
214
|
-
#puts ">>#{ft}<<, text:>>#{text}<<" if ((ft.size < 20) && ft =~ /Open Source/)
|
215
|
-
if (ft == text)
|
216
|
-
@found = true
|
217
|
-
@node = node
|
218
|
-
end
|
219
|
-
end
|
220
|
-
node.children.each do |child|
|
221
|
-
if child.instance_of? Hpricot::Elem
|
222
|
-
traverse_for_full_text(child, text) unless NON_CONTENT_TAGS.include? child.name
|
223
|
-
end
|
224
|
-
end
|
172
|
+
|
173
|
+
#Depth-first search of _node_ and its descendants for the first element
#(in document order) which has _attribute_; the hit is stored in @node.
#Does nothing if @node is already set or _node_ is not an Hpricot::Elem.
def self.first_child_node_with_attribute(node, attribute)
  #The parentheses matter: without them '|| @node' becomes part of the
  #argument of instance_of?, @node is never checked, and every later match
  #overwrites the first one.
  return if @node || !node.instance_of?(Hpricot::Elem)
  if node.attributes[attribute]
    @node = node
    return
  end
  node.children.each { |child| first_child_node_with_attribute(child, attribute) }
end
|
226
178
|
|
227
|
-
def self.
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
end #End of method lowest_possible_node_with_text
|
179
|
+
#Walk from _node_ upwards through its parents and store the nearest element
#which has _attribute_ in @node. Does nothing if @node is already set; the
#walk ends at the first node which is not an Hpricot::Elem.
def self.first_parent_node_with_attribute(node, attribute)
  return if @node
  current = node
  #Stop at the first hit: checking '@node' inside the argument of
  #instance_of? (as the unparenthesized form did) never stopped the walk,
  #so the topmost ancestor won instead of the nearest one.
  while current.instance_of?(Hpricot::Elem)
    if current.attributes[attribute]
      @node = current
      return
    end
    current = current.parent
  end
end
|
184
|
+
|
185
|
+
#Remove every [...] predicate (e.g. the indices) from _xpath_ and return
#the result as a new string
def self.to_general_XPath(xpath)
  predicate = /\[.+?\]/
  xpath.gsub(predicate, "")
end #End of method to_general_XPath
|
237
188
|
end #End of class XPathUtils
|
238
189
|
end #End of module Scrubyt
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'scrubyt'
|
2
|
+
require 'test/unit'
|
3
|
+
|
4
|
+
#Unit tests of Scrubyt::SimpleExampleLookup.
#
#The class has to inherit from Test::Unit::TestCase, otherwise the test
#runner never picks it up.
class SimpleExampleLookupTest < Test::Unit::TestCase

  #Build a small document and keep references to its elements.
  #The odd child indices below skip the whitespace text nodes between the
  #elements, so the line breaks and the indentation inside the heredoc
  #(including the one in front of <a>) are significant.
  def setup
    doc1 = <<-DOC
      <a>
        <b>
          <c/>
          <d>dddd</d>
          <e>
            <f>fff</f>
            <k>kk</k>
            <j/>
            <l>lll</l>
            <m/>
            <n>nnn</n>
            <n>nnnnnn</n>
            <n>
              nnnnnnnnn
              <q/>
              <r>rrr</r>
            </n>
            <o>ooo</o>
            <n>nnnnnnnnnnnn</n>
            <p>ppp</p>
          </e>
        </b>
        <g>ggg</g>
      </a>
    DOC
    @doc1 = Hpricot(doc1)
    @a = @doc1.children[1]
    @b = @a.children[1]
    @c = @b.children[1]
    @d = @b.children[3]
    @e = @b.children[5]
    @f = @e.children[1]
    @g = @a.children[@a.children.size-2]
    @k = @e.children[3]
    @j = @e.children[5]
    @l = @e.children[7]
    @m = @e.children[9]
    @n_1 = @e.children[11]
    @n_2 = @e.children[13]
    @n_3 = @e.children[15]
    @o = @e.children[17]
    @n_4 = @e.children[19]
    @p = @e.children[21]
    @q = @n_3.children[1]
    @r = @n_3.children[3]
    #@doc2 = Hpricot(open(File.join(File.dirname(__FILE__), "test.html")))
  end

  #find_node_from_text lives in SimpleExampleLookup now (it was removed
  #from XPathUtils), so that is the class exercised here.
  def test_find_node_from_text
    elem = Scrubyt::SimpleExampleLookup.find_node_from_text(@doc1,"fff", false)
    assert_instance_of(Hpricot::Elem, elem)
    assert_equal(@f, elem)

    elem = Scrubyt::SimpleExampleLookup.find_node_from_text(@doc1,"dddd", false)
    assert_equal(@d, elem)

    elem = Scrubyt::SimpleExampleLookup.find_node_from_text(@doc1,"rrr", false)
    assert_equal(@r, elem)
  end
end
|
@@ -53,20 +53,7 @@ class XPathUtilsTest < Test::Unit::TestCase
|
|
53
53
|
@r = @n_3.children[3]
|
54
54
|
#@doc2 = Hpricot(open(File.join(File.dirname(__FILE__), "test.html")))
|
55
55
|
end
|
56
|
-
|
57
|
-
def test_find_node_from_text
|
58
|
-
elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"fff", false)
|
59
|
-
assert_instance_of(Hpricot::Elem, elem)
|
60
|
-
assert_equal(elem, @f)
|
61
|
-
|
62
|
-
elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"dddd", false)
|
63
|
-
assert_equal(elem, @d)
|
64
|
-
|
65
|
-
elem = Scrubyt::XPathUtils.find_node_from_text(@doc1,"rrr", false)
|
66
|
-
assert_equal(elem, @r)
|
67
56
|
|
68
|
-
end
|
69
|
-
|
70
57
|
def test_lowest_common_ancestor
|
71
58
|
lca_b_g = Scrubyt::XPathUtils.lowest_common_ancestor(@b,@g)
|
72
59
|
lca_f_d = Scrubyt::XPathUtils.lowest_common_ancestor(@f,@d)
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: scrubyt
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.2.
|
7
|
-
date: 2007-
|
6
|
+
version: 0.2.6
|
7
|
+
date: 2007-03-25 00:00:00 +01:00
|
8
8
|
summary: A powerful Web-scraping framework
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -34,19 +34,24 @@ files:
|
|
34
34
|
- CHANGELOG
|
35
35
|
- Rakefile
|
36
36
|
- lib/scrubyt.rb
|
37
|
+
- lib/scrubyt/utils/shared_utils.rb
|
37
38
|
- lib/scrubyt/utils/xpathutils.rb
|
39
|
+
- lib/scrubyt/utils/simple_example_lookup.rb
|
40
|
+
- lib/scrubyt/utils/compound_example_lookup.rb
|
38
41
|
- lib/scrubyt/output/result_dumper.rb
|
39
42
|
- lib/scrubyt/output/export.rb
|
40
43
|
- lib/scrubyt/output/post_processor.rb
|
41
44
|
- lib/scrubyt/output/result.rb
|
42
|
-
- lib/scrubyt/core/navigation/fetch_action.rb
|
43
45
|
- lib/scrubyt/core/navigation/navigation_actions.rb
|
46
|
+
- lib/scrubyt/core/navigation/fetch_action.rb
|
44
47
|
- lib/scrubyt/core/scraping/result_indexer.rb
|
45
48
|
- lib/scrubyt/core/scraping/constraint_adder.rb
|
46
49
|
- lib/scrubyt/core/scraping/constraint.rb
|
47
50
|
- lib/scrubyt/core/scraping/filter.rb
|
48
51
|
- lib/scrubyt/core/scraping/pattern.rb
|
49
52
|
- lib/scrubyt/core/scraping/pre_filter_document.rb
|
53
|
+
- lib/scrubyt/core/scraping/compound_example.rb
|
54
|
+
- lib/scrubyt/core/shared/u_r_i_builder.rb
|
50
55
|
- lib/scrubyt/core/shared/evaluation_context.rb
|
51
56
|
- lib/scrubyt/core/shared/extractor.rb
|
52
57
|
test_files:
|
@@ -56,6 +61,7 @@ test_files:
|
|
56
61
|
- test/unittests/extractor_test.rb
|
57
62
|
- test/unittests/xpathutils_test.rb
|
58
63
|
- test/unittests/constraint_test.rb
|
64
|
+
- test/unittests/simple_example_lookup_test.rb
|
59
65
|
- test/unittests/input/constraint_test.html
|
60
66
|
- test/unittests/input/test.html
|
61
67
|
rdoc_options: []
|