scrubyt 0.2.6 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +59 -12
- data/Rakefile +2 -2
- data/lib/scrubyt.rb +24 -6
- data/lib/scrubyt/core/navigation/fetch_action.rb +91 -56
- data/lib/scrubyt/core/navigation/navigation_actions.rb +32 -22
- data/lib/scrubyt/core/scraping/constraint.rb +53 -57
- data/lib/scrubyt/core/scraping/constraint_adder.rb +15 -38
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +17 -0
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +111 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +49 -0
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -0
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +17 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +121 -0
- data/lib/scrubyt/core/scraping/pattern.rb +292 -157
- data/lib/scrubyt/core/scraping/result_indexer.rb +51 -47
- data/lib/scrubyt/core/shared/evaluation_context.rb +3 -42
- data/lib/scrubyt/core/shared/extractor.rb +122 -163
- data/lib/scrubyt/output/export.rb +59 -174
- data/lib/scrubyt/output/post_processor.rb +4 -3
- data/lib/scrubyt/output/result.rb +8 -9
- data/lib/scrubyt/output/result_dumper.rb +81 -42
- data/lib/scrubyt/utils/compound_example_lookup.rb +11 -11
- data/lib/scrubyt/utils/ruby_extensions.rb +113 -0
- data/lib/scrubyt/utils/shared_utils.rb +39 -26
- data/lib/scrubyt/utils/simple_example_lookup.rb +6 -6
- data/lib/scrubyt/utils/xpathutils.rb +31 -30
- data/test/unittests/constraint_test.rb +11 -7
- data/test/unittests/extractor_test.rb +6 -6
- data/test/unittests/filter_test.rb +66 -66
- metadata +22 -15
- data/lib/scrubyt/core/scraping/filter.rb +0 -201
@@ -4,27 +4,27 @@ module Scrubyt
|
|
4
4
|
#the simple example and the compound example.
|
5
5
|
#
|
6
6
|
#This class is responsible for finding elements matched by compound examples.
|
7
|
-
#In the future probably more sophisticated matching algorithms will be added
|
8
|
-
#(e.g. match the n-th which matches the text, or element that matches the
|
7
|
+
#In the future probably more sophisticated matching algorithms will be added
|
8
|
+
#(e.g. match the n-th which matches the text, or element that matches the
|
9
9
|
#text but also contains a specific attribute etc.)
|
10
10
|
class CompoundExampleLookup
|
11
|
-
def self.find_node_from_compund_example(doc, compound_example, next_link)
|
11
|
+
def self.find_node_from_compund_example(doc, compound_example, next_link=false, index = 0)
|
12
12
|
@partial_results = []
|
13
|
-
self.lookup_compound_example(doc, compound_example)
|
13
|
+
self.lookup_compound_example(doc, compound_example, index)
|
14
14
|
end
|
15
|
-
|
15
|
+
|
16
16
|
private
|
17
17
|
#Lookup the first element which is matched by this compound example
|
18
18
|
#
|
19
|
-
#A compound example is specified with :contains, :begins_with and
|
19
|
+
#A compound example is specified with :contains, :begins_with and
|
20
20
|
#:ends_with descriptors - which can be both regexps or strings
|
21
21
|
#
|
22
22
|
#Example:
|
23
23
|
#
|
24
24
|
#flight_info :begins_with => 'Arrival', :contains => /\d\d-\d+/, :ends_with => '20:00'
|
25
|
-
def self.lookup_compound_example(doc, compound_example)
|
25
|
+
def self.lookup_compound_example(doc, compound_example, index)
|
26
26
|
compound_example.each do |k,v|
|
27
|
-
v = Regexp.escape(v) if v.is_a? String
|
27
|
+
v = Regexp.escape(v) if v.is_a? String
|
28
28
|
case k
|
29
29
|
when :contains
|
30
30
|
v = /#{v}/
|
@@ -39,12 +39,12 @@ private
|
|
39
39
|
refine_partial_results(v)
|
40
40
|
end
|
41
41
|
end
|
42
|
-
@partial_results
|
42
|
+
@partial_results[index]
|
43
43
|
end
|
44
|
-
|
44
|
+
|
45
45
|
def self.refine_partial_results(regexp)
|
46
46
|
@partial_results = @partial_results.select {|pr| pr.inner_text =~ regexp}
|
47
47
|
end
|
48
|
-
|
48
|
+
|
49
49
|
end #End of class CompoundExampleLookup
|
50
50
|
end #End of module Scrubyt
|
@@ -0,0 +1,113 @@
|
|
1
|
+
class Module
|
2
|
+
def option_reader(key_default_hash)
|
3
|
+
key_default_hash.each do |key, default|
|
4
|
+
define_method(key) {
|
5
|
+
if @options[key].nil?
|
6
|
+
if default.is_a? Proc
|
7
|
+
instance_eval(&default)
|
8
|
+
else
|
9
|
+
default
|
10
|
+
end
|
11
|
+
else
|
12
|
+
@options[key]
|
13
|
+
end
|
14
|
+
}
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def option_writer(*keys)
|
19
|
+
keys.each do |key|
|
20
|
+
define_method("#{key.to_s}=".to_sym) { |value|
|
21
|
+
@options[key] = value
|
22
|
+
}
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
def option(key, default=nil, writable=false)
|
27
|
+
option_reader(key => default)
|
28
|
+
option_writer(key) if writable
|
29
|
+
end
|
30
|
+
|
31
|
+
def option_accessor(key_default_hash)
|
32
|
+
key_default_hash.each do |key, default|
|
33
|
+
option(key, default, true)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
class Range
|
39
|
+
def <=>(other)
|
40
|
+
self.begin <=> other.begin
|
41
|
+
end
|
42
|
+
|
43
|
+
def +(amount)
|
44
|
+
(self.begin + amount)..(self.end + amount)
|
45
|
+
end
|
46
|
+
|
47
|
+
def -(amount)
|
48
|
+
(self.begin - amount)..(self.end - amount)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
module Math
|
53
|
+
def self.min(a, b)
|
54
|
+
a < b ? a : b
|
55
|
+
end
|
56
|
+
|
57
|
+
def self.max(a, b)
|
58
|
+
a > b ? a : b
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
class Array
|
63
|
+
def to_sexp
|
64
|
+
[:array, *to_sexp_array]
|
65
|
+
end
|
66
|
+
|
67
|
+
def to_sexp_array
|
68
|
+
collect { |element| element.to_sexp }
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
class Hash
|
73
|
+
def to_sexp
|
74
|
+
[:hash, *to_sexp_array]
|
75
|
+
end
|
76
|
+
|
77
|
+
def to_sexp_array
|
78
|
+
sexp = []
|
79
|
+
each { |key, value| sexp.push(key.to_sexp, value.to_sexp) }
|
80
|
+
sexp
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
class Symbol
|
85
|
+
def to_sexp
|
86
|
+
[:lit, self]
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
class String
|
91
|
+
def to_sexp
|
92
|
+
[:str, self]
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
class TrueClass
|
97
|
+
def to_sexp
|
98
|
+
[:true]
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
class FalseClass
|
103
|
+
def to_sexp
|
104
|
+
[:false]
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
class Proc
|
109
|
+
alias_method :parse_tree_to_sexp, :to_sexp
|
110
|
+
def to_sexp
|
111
|
+
[:iter, [:fcall, :lambda], nil, parse_tree_to_sexp[1] ]
|
112
|
+
end
|
113
|
+
end
|
@@ -1,11 +1,8 @@
|
|
1
1
|
module Scrubyt
|
2
2
|
##
|
3
|
-
#=<tt>Utilities shared between the other utility classes (XPathUtils, SimpleExampleLookup,...)</tt>
|
4
|
-
#
|
3
|
+
#=<tt>Utilities shared between the other utility classes (XPathUtils, SimpleExampleLookup,...)</tt>
|
4
|
+
#
|
5
5
|
class SharedUtils
|
6
|
-
#When looking up examples, do NOT recurse into these tags since they won't contain any usable info
|
7
|
-
NON_CONTENT_TAGS = ['form','option', 'input', 'script', 'noscript']
|
8
|
-
|
9
6
|
#Entities to replace - need to make this more complete, or install htmlentities or similar package
|
10
7
|
ENTITIES = {
|
11
8
|
'quot' => '"',
|
@@ -14,32 +11,48 @@ module Scrubyt
|
|
14
11
|
'lt' => '<',
|
15
12
|
'gt' => '>',
|
16
13
|
'nbsp' => ' '}
|
17
|
-
|
14
|
+
|
15
|
+
def self.prepare_text_for_comparison(text)
|
16
|
+
unescape_entities text
|
17
|
+
text.strip!
|
18
|
+
text
|
19
|
+
end
|
20
|
+
|
18
21
|
#Unescape the entities in the HTML!
|
19
22
|
def self.unescape_entities(text)
|
20
|
-
|
21
|
-
|
22
|
-
end
|
23
|
-
|
23
|
+
ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
|
24
|
+
text
|
25
|
+
end
|
26
|
+
|
24
27
|
#Entry point for finding the elements specified by examples
|
25
28
|
def self.traverse_for_match(node, regexp)
|
26
|
-
|
27
|
-
traverse_for_match_inner
|
28
|
-
|
29
|
+
results = []
|
30
|
+
traverse_for_match_inner = lambda { |node, regexp|
|
31
|
+
ft = prepare_text_for_comparison(node.inner_text)
|
32
|
+
if ft =~ regexp
|
33
|
+
node.instance_eval do
|
34
|
+
@match_data = $~
|
35
|
+
def match_data
|
36
|
+
@match_data
|
37
|
+
end
|
38
|
+
end
|
39
|
+
results << node
|
40
|
+
results.delete node.parent if node.is_a? Hpricot::Elem
|
41
|
+
end
|
42
|
+
node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) }
|
43
|
+
}
|
44
|
+
traverse_for_match_inner.call(node,regexp)
|
45
|
+
results
|
29
46
|
end
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
@results.delete node.parent
|
47
|
+
|
48
|
+
def self.get_backtrace
|
49
|
+
begin
|
50
|
+
raise
|
51
|
+
rescue Exception => ex
|
52
|
+
backtrace = ex.backtrace
|
37
53
|
end
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
end
|
42
|
-
end
|
43
|
-
end #end of method traverse_for_match
|
54
|
+
backtrace.slice!(0)
|
55
|
+
backtrace
|
56
|
+
end
|
44
57
|
end #end of class SharedUtils
|
45
58
|
end #end of module Scrubyt
|
@@ -4,8 +4,8 @@ module Scrubyt
|
|
4
4
|
#the simple example and the compound example.
|
5
5
|
#
|
6
6
|
#This class is responsible for finding elements matched by simple examples.
|
7
|
-
#In the future probably more sophisticated matching algorithms will be added
|
8
|
-
#(e.g. match the n-th which matches the text, or element that matches the
|
7
|
+
#In the future probably more sophisticated matching algorithms will be added
|
8
|
+
#(e.g. match the n-th which matches the text, or element that matches the
|
9
9
|
#text but also contains a specific attribute etc.)
|
10
10
|
class SimpleExampleLookup
|
11
11
|
#From the example text defined by the user, find the lowest possible node which contains the text 'text'.
|
@@ -14,10 +14,10 @@ module Scrubyt
|
|
14
14
|
# <a>Bon <b>nuit</b>, monsieur!</a>
|
15
15
|
#
|
16
16
|
#In this case, <a>'s text is considered to be "Bon nuit, monsieur"
|
17
|
-
def self.find_node_from_text(doc, text, next_link)
|
17
|
+
def self.find_node_from_text(doc, text, next_link=false, index = 0)
|
18
18
|
text.gsub!('»', '»')
|
19
|
-
text = Regexp.escape(text) if text.is_a? String
|
20
|
-
SharedUtils.traverse_for_match(doc,/#{text}/)
|
21
|
-
end
|
19
|
+
text = Regexp.escape(text) if text.is_a? String
|
20
|
+
SharedUtils.traverse_for_match(doc,/#{text}/)[index]
|
21
|
+
end
|
22
22
|
end #End of class SimpleExampleLookup
|
23
23
|
end #End of module Scrubyt
|
@@ -4,9 +4,9 @@ require 'hpricot'
|
|
4
4
|
module Scrubyt
|
5
5
|
##
|
6
6
|
#=<tt>Various XPath utility functions</tt>
|
7
|
-
class XPathUtils
|
8
|
-
|
9
|
-
#Find the LCA (Lowest Common Ancestor) of two nodes
|
7
|
+
class XPathUtils
|
8
|
+
|
9
|
+
#Find the LCA (Lowest Common Ancestor) of two nodes
|
10
10
|
def self.lowest_common_ancestor(node1, node2)
|
11
11
|
path1 = traverse_up(node1)
|
12
12
|
path2 = traverse_up(node2)
|
@@ -19,7 +19,7 @@ module Scrubyt
|
|
19
19
|
end
|
20
20
|
path1.size > path2.size ? path1.last.parent : path2.last.parent
|
21
21
|
end
|
22
|
-
|
22
|
+
|
23
23
|
##
|
24
24
|
#Generate XPath for the given node
|
25
25
|
#
|
@@ -28,7 +28,7 @@ module Scrubyt
|
|
28
28
|
#_node_ - The node we are looking up the XPath for
|
29
29
|
#
|
30
30
|
#_stopnode_ - The Xpath generation is stopped and the XPath that
|
31
|
-
#was generated so far is returned if this node is reached.
|
31
|
+
#was generated so far is returned if this node is reached.
|
32
32
|
#
|
33
33
|
#_write_indices_ - whether the index inside the parent should be
|
34
34
|
#added, as in html[1]/body[1]/table[2]/tr[1]/td[8]
|
@@ -36,7 +36,7 @@ module Scrubyt
|
|
36
36
|
path = []
|
37
37
|
indices = []
|
38
38
|
found = false
|
39
|
-
while node.class != Hpricot::Doc do
|
39
|
+
while !node.nil? && node.class != Hpricot::Doc do
|
40
40
|
if node == stopnode
|
41
41
|
found = true
|
42
42
|
break
|
@@ -53,32 +53,32 @@ module Scrubyt
|
|
53
53
|
path.reverse.zip(indices.reverse).each { |node,index| result += "#{node}[#{index}]/" }
|
54
54
|
else
|
55
55
|
path.reverse.each{ |node| result += "#{node}/" }
|
56
|
-
end
|
56
|
+
end
|
57
57
|
"/" + result.chop
|
58
58
|
end
|
59
|
-
|
60
|
-
#Generate an XPath of the node with indices, relatively to the given
|
59
|
+
|
60
|
+
#Generate an XPath of the node with indices, relatively to the given
|
61
61
|
#relative_root.
|
62
62
|
#
|
63
|
-
#For example if the elem's absolute XPath is /a/b/c,
|
63
|
+
#For example if the elem's absolute XPath is /a/b/c,
|
64
64
|
#and the relative root's Xpath is a/b, the result of the function will
|
65
65
|
#be /c.
|
66
66
|
def self.generate_relative_XPath( elem,relative_root )
|
67
67
|
return nil if (elem == relative_root)
|
68
68
|
generate_XPath(elem, relative_root, true)
|
69
69
|
end
|
70
|
-
|
71
|
-
#Generate a generalized XPath (i.e. without indices) of the node,
|
70
|
+
|
71
|
+
#Generate a generalized XPath (i.e. without indices) of the node,
|
72
72
|
#relatively to the given relative_root.
|
73
73
|
#
|
74
|
-
#For example if the elem's absolute XPath is /a[1]/b[3]/c[5],
|
74
|
+
#For example if the elem's absolute XPath is /a[1]/b[3]/c[5],
|
75
75
|
#and the relative root's Xpath is a[1]/b[3], the result of the function will
|
76
|
-
#be /c.
|
76
|
+
#be /c.
|
77
77
|
def self.generate_generalized_relative_XPath( elem,relative_root )
|
78
78
|
return nil if (elem == relative_root)
|
79
79
|
generate_XPath(elem, relative_root, false)
|
80
80
|
end
|
81
|
-
|
81
|
+
|
82
82
|
#Find an image based on the src of the example
|
83
83
|
#
|
84
84
|
#*parameters*
|
@@ -91,7 +91,7 @@ module Scrubyt
|
|
91
91
|
#and thus can be easily pasted as an example
|
92
92
|
#
|
93
93
|
#_index_ - there might be more images with the same src on the page -
|
94
|
-
#most typically the user will need the 0th - but if this is not the
|
94
|
+
#most typically the user will need the 0th - but if this is not the
|
95
95
|
#case, there is the possibility to override this
|
96
96
|
def self.find_image(doc, example, index=0)
|
97
97
|
(doc/"//img[@src='#{example}']")[index]
|
@@ -99,19 +99,20 @@ module Scrubyt
|
|
99
99
|
|
100
100
|
##
|
101
101
|
#Used to find the parent of a node with the given name - for example
|
102
|
-
#find the <form> node which is the parent of the <input> node
|
102
|
+
#find the <form> node which is the parent of the <input> node
|
103
103
|
def self.traverse_up_until_name(node, name)
|
104
104
|
while node.class != Hpricot::Doc do
|
105
|
+
raise "The element is nil! This probably means the widget with the specified name ('#{name}') does not exist" unless node
|
105
106
|
break if node.name == name
|
106
107
|
node = node.parent
|
107
108
|
end
|
108
109
|
node
|
109
110
|
end
|
110
|
-
|
111
|
+
|
111
112
|
##
|
112
113
|
#Used when automatically looking up href attributes (for detail or next links)
|
113
|
-
#If the detail pattern did not extract a link, we first look up its
|
114
|
-
#children - and if we don't find a link, traverse up
|
114
|
+
#If the detail pattern did not extract a link, we first look up its
|
115
|
+
#children - and if we don't find a link, traverse up
|
115
116
|
def self.find_nearest_node_with_attribute(node, attribute)
|
116
117
|
@node = nil
|
117
118
|
return node if node.is_a? Hpricot::Elem and node[attribute]
|
@@ -119,13 +120,13 @@ module Scrubyt
|
|
119
120
|
first_parent_node_with_attribute(node, attribute) if !@node
|
120
121
|
@node
|
121
122
|
end
|
122
|
-
|
123
|
+
|
123
124
|
##
|
124
125
|
#Generate a relative XPath from two XPaths: a parent one (which points higher in the tree),
|
125
126
|
#and a child one. The result of the method is the relative XPath of the node pointed to
|
126
127
|
#by the second XPath to the node pointed to by the first XPath.
|
127
128
|
def self.generate_relative_XPath_from_XPaths(parent_xpath, child_xpath)
|
128
|
-
original_child_xpath_parts = child_xpath.split('/').reject{|s|s==""}
|
129
|
+
original_child_xpath_parts = child_xpath.split('/').reject{|s|s==""}
|
129
130
|
pairs = to_general_XPath(child_xpath).split('/').reject{|s|s==""}.zip to_general_XPath(parent_xpath).split('/').reject{|s|s==""}
|
130
131
|
i = 0
|
131
132
|
pairs.each_with_index do |pair,index|
|
@@ -134,7 +135,7 @@ module Scrubyt
|
|
134
135
|
end
|
135
136
|
"/" + original_child_xpath_parts[i..-1].join('/')
|
136
137
|
end
|
137
|
-
|
138
|
+
|
138
139
|
private
|
139
140
|
#Find the index of the child inside the parent
|
140
141
|
#For example:
|
@@ -142,7 +143,7 @@ private
|
|
142
143
|
# tr
|
143
144
|
# / | \
|
144
145
|
# td td td
|
145
|
-
# 0 1 2
|
146
|
+
# 0 1 2
|
146
147
|
#
|
147
148
|
#The last row contains the indices of the td's from the
|
148
149
|
#row above.
|
@@ -154,7 +155,7 @@ private
|
|
154
155
|
node.parent.children.each do |child|
|
155
156
|
if child.class == Hpricot::Elem
|
156
157
|
c += 1 if (child.name == node.name)
|
157
|
-
break if (node == child)
|
158
|
+
break if (node == child)
|
158
159
|
end
|
159
160
|
end
|
160
161
|
c
|
@@ -169,21 +170,21 @@ private
|
|
169
170
|
end
|
170
171
|
path
|
171
172
|
end
|
172
|
-
|
173
|
+
|
173
174
|
def self.first_child_node_with_attribute(node, attribute)
|
174
175
|
return if !node.instance_of? Hpricot::Elem || @node
|
175
176
|
@node = node if node.attributes[attribute]
|
176
177
|
node.children.each { |child| first_child_node_with_attribute(child, attribute) }
|
177
178
|
end
|
178
|
-
|
179
|
+
|
179
180
|
def self.first_parent_node_with_attribute(node, attribute)
|
180
181
|
return if !node.instance_of? Hpricot::Elem || @node
|
181
182
|
@node = node if node.attributes[attribute]
|
182
183
|
first_parent_node_with_attribute(node.parent, attribute)
|
183
|
-
end
|
184
|
-
|
184
|
+
end
|
185
|
+
|
185
186
|
def self.to_general_XPath(xpath)
|
186
187
|
xpath.gsub(/\[.+?\]/) {""}
|
187
|
-
end #End of method to_general_XPath
|
188
|
+
end #End of method to_general_XPath
|
188
189
|
end #End of class XPathUtils
|
189
190
|
end #End of module Scrubyt
|